pacman::p_load(dplyr,
               scales,
               readxl,
               zoo,
               csvy,
               tidyr,
               lme4,
               jtools,
               tidycensus,
               tidyverse,
               stringr,
               cregg,
               readr,
               purrr,
               covidcast,
               broom)

# data for SECTION 1 ----

# GSS data (see README for note about data access)
# 
# dat <- read.csv("data/raw/gss.csv") %>%
#   filter(race%in%c("White", "Black")) %>%
#   mutate(racedif1 = case_when(racdif1=="YES"~1,
#                               racdif1=="NO"~0,
#                               T~NA_real_),
#          racedif2 = case_when(racdif2=="YES"~1,
#                               racdif2=="NO"~0,
#                               T~NA_real_),
#          racedif3 = case_when(racdif3=="YES"~1,
#                               racdif3=="NO"~0,
#                               T~NA_real_),
#          racedif4 = case_when(racdif4=="YES"~1,
#                               racdif4=="NO"~0,
#                               T~NA_real_),
#          party = case_when(grepl("democrat", partyid)~"Democrat",
#                            grepl("republican", partyid)~"Republican",
#                            T~"Independent/Other/Refused"),
#          party7 = case_when(partyid=="Strong democrat"~"Strong Democrat",
#                             partyid=="Not very strong democrat"~"Moderate Democrat",
#                             partyid=="Independent, close to democrat"~"Lean Democrat",
#                             partyid=="Independent (neither, no response)"~"Independent",
#                             partyid=="Independent, close to republican"~"Lean Republican",
#                             partyid=="Not very strong republican"~"Moderate Republican",
#                             partyid=="Strong republican"~"Strong Republican",
#                             T~NA_character_),
#          helpblk = case_when(helpblk=="No special treatment"~0,
#                              helpblk=="Government should help"~1,
#                              helpblk=="Agree with both"~0.5,
#                              helpblk=="2"~0.75,
#                              helpblk=="4"~0.25,
#                              T~NA_real_),
#          natrace = case_when(natrace=="TOO LITTLE"~1,
#                              natrace=="TOO MUCH"~0,
#                              natrace=="ABOUT RIGHT"~0.5,
#                              T~NA_real_),
#          natracey = case_when(natracey=="TOO LITTLE"~1,
#                               natracey=="TOO MUCH"~0,
#                               natracey=="ABOUT RIGHT"~0.5,
#                               T~NA_real_),
#          race = race,
#          closeblk = case_when(closeblk=="NOT AT ALL CLOSE"~0,
#                               closeblk=="2"~0.125,
#                               closeblk=="3"~0.25,
#                               closeblk=="4"~0.375,
#                               closeblk=="NEITHER ONE OR THE OTHER"~0.5,
#                               closeblk=="6"~0.625,
#                               closeblk=="7"~0.75,
#                               closeblk=="8"~0.875,
#                               closeblk=="VERY CLOSE"~1,
#                               T~NA_real_),
#          closewht = case_when(closewht=="NOT AT ALL CLOSE"~0,
#                               closewht=="2"~0.125,
#                               closewht=="3"~0.25,
#                               closewht=="4"~0.375,
#                               closewht=="NEITHER ONE OR THE OTHER"~0.5,
#                               closewht=="6"~0.625,
#                               closewht=="7"~0.75,
#                               closewht=="8"~0.875,
#                               closewht=="VERY CLOSE"~1,
#                               T~NA_real_),
#          affrmact = case_when(affrmact=="Strongly favors"~1,
#                               affrmact=="Not strongly favors"~0.67,
#                               affrmact=="Not strongly opposes"~0.33,
#                               affrmact=="Strongly opposes"~0,
#                               T~NA_real_),
#          wrkwayup = case_when(wrkwayup=="Disagree strongly"~0,
#                               wrkwayup=="Disagree somewhat"~0.25,
#                               wrkwayup=="Neither agree nor disagree"~0.5,
#                               wrkwayup=="Agree somewhat"~0.75,
#                               wrkwayup=="Agree strongly"~1,
#                               T~NA_real_),
#          wlthblks = case_when(wlthblks=="POOR"~1,
#                               wlthblks=="RICH"~0,
#                               wlthblks=="4"~0.5,
#                               wlthblks=="2"~1/6,
#                               wlthblks=="3"~1/3,
#                               wlthblks=="5"~2/3,
#                               wlthblks=="6"~5/6,
#                               T~NA_real_),
#          workblks = case_when(workblks=="LAZY"~1,
#                               workblks=="HARDWORKING"~0,
#                               workblks=="4"~0.5,
#                               workblks=="2"~1/6,
#                               workblks=="3"~1/3,
#                               workblks=="5"~2/3,
#                               workblks=="6"~5/6,
#                               T~NA_real_),
#          intlblks = case_when(intlblks=="UNINTELLIGENT"~0,
#                               intlblks=="INTELLIGENT"~1,
#                               intlblks=="4"~0.5,
#                               intlblks=="2"~1/6,
#                               intlblks=="3"~1/3,
#                               intlblks=="5"~2/3,
#                               intlblks=="6"~5/6,
#                               T~NA_real_),
#          discaff = case_when(discaff=="Not very likely"~0,
#                              discaff=="Somewhat likely"~0.5,
#                              discaff=="Very likely"~1,
#                              T~NA_real_),
#          ideo = case_when(polviews=="Extremely liberal"~0,
#                           polviews=="Liberal"~0.17,
#                           polviews=="Slightly liberal"~0.33,
#                           polviews=="Moderate, middle of the road"~0.5,
#                           polviews=="Slightly conservative"~0.67,
#                           polviews=="Conservative"~0.83,
#                           polviews=="Extremely conservative"~1,
#                           T~NA_real_),
#          ineqdisc = case_when(racdif1=="YES"~1,
#                               racdif1=="NO"~0,
#                               T~NA_real_),
#          weight = wtssps
#   ) %>%
#   rowwise() %>%
#   mutate(white_black_diff = closewht - closeblk) %>%
#   filter(!is.na(party)) %>%
#   dplyr::select(c(year, race, party, weight, wrkwayup, ineqdisc, helpblk)) %>%
#   mutate(wrkwayup = rescale(wrkwayup, to = c(1,0))) %>%
#   pivot_longer(!c(year, race, party, weight), names_to = "variable", values_to = "value") %>%
#   filter(!is.na(party)&!is.na(value)) %>%
#   mutate(group = paste(race, party, sep = " ")) %>%
#   filter(group %in% c("Black Democrat", "White Democrat", "White Republican")) %>%
#   group_by(group, year, variable, party, race) %>%
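#   # NOTE: weighted.mean() takes its weights through the argument `w`;
#   # `weights = weight` is absorbed by `...` and ignored, so the call below
#   # returns an UNWEIGHTED mean. Use `w = weight` if weighting is intended.
#   summarise(value = weighted.mean(value, na.rm = T, weights = weight))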
# 
# write.csv(dat, "data/clean/gss_yearmeans.csv")
#
# ANES data (see README for note about data access)
# 
# anesdat <- fread("data/raw/anes_timeseries_cdf_csv_20220916.csv",
#              select = c("VCF9274", "VCF9275", # Black influence
#                         "VCF0301", "VCF0105b", # partisanship and race
#                         "VCF0004", "VCF0009z", # year and weight
#                         "VCF0206")) %>% # Black feeling thermometer
#   filter(VCF0105b%in%c(1,2)&VCF0301%in%c(1,2,3,5,6,7)) %>%
#   mutate(inf_blacks = case_when(VCF9274==1~"Too much influence",
#                                 VCF9274==2~"Just about the right amount",
#                                 VCF9274==3~"Too little influence",
#                                 VCF9275==1~"Too much influence",
#                                 VCF9275==2~"Just about the right amount",
#                                 VCF9275==3~"Too little influence"),
#          inf_blacks_num = case_when(inf_blacks=="Too much influence"~-1,
#                                     inf_blacks=="Just about the right amount"~0,
#                                     inf_blacks=="Too little influence"~1),
#          race = case_when(VCF0105b==1~"White",
#                           VCF0105b==2~"Black"),
#          pid3 = case_when(VCF0301%in%c(1,2,3)~"Democrat",
#                           T~"Republican"),
#          year = VCF0004,
#          weight = VCF0009z,
#          resptype = paste(race, pid3),
#          therm_blacks = case_when(VCF0206%in%c(98, 99)~NA_real_,
#                                   T~VCF0206),
#          therm_blacks = rescale(therm_blacks, to = c(0,1))) %>%
#   filter(resptype!="Black Republican") %>%
#   select(c(inf_blacks, inf_blacks_num, race, pid3, year, weight, resptype, therm_blacks)) %>%
#   mutate(inf_blacks_num = rescale(inf_blacks_num, to = c(0,1))) %>%
#   group_by(year, resptype) %>%
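#   # NOTE: weighted.mean() takes its weights through the argument `w`;
#   # `weight = weight` is absorbed by `...` and ignored, so `yearmean` below
#   # is an UNWEIGHTED mean. Use `w = weight` if weighting is intended.
#   summarize(yearmean = weighted.mean(inf_blacks_num, weight = weight, na.rm = T),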
#             n = sum(weight)) %>%
#   ungroup() %>%
#   filter(!is.na(yearmean)) %>%
#   rename(group = resptype,
#          value = yearmean) %>%
#   mutate(variable = "Blacks have (too much — too little) influence in politics")
# 
# write.csv(anesdat, "data/clean/anes_yearmeans_blackinf.csv")

# Presidential vote share by congressional district, read from the first sheet
# of the 115th Congress members guide workbook.
#
# `load_pres_vote()` keeps the district code plus the two requested vote
# columns, drops the first two rows of the sheet, converts the vote columns to
# proportions (value / 100), and rewrites district codes containing "-AL" to
# "-01" so they follow the same pattern as numbered districts.
#
# dem_col, rep_col: names of the Democratic / Republican vote columns as they
#   appear after read_excel() has repaired the sheet's column names.
# Returns a tibble with columns district, dem_vote_share, rep_vote_share.
load_pres_vote <- function(dem_col, rep_col) {
  sheet <- read_excel(
    "data/raw/legislators/115th Congress Members Guide with Elections and Demographic Data by District.xlsx",
    sheet = 1
  )
  votes <- sheet[, c("Code", dem_col, rep_col)]
  votes <- votes[seq(3, nrow(votes)), ]
  names(votes) <- c("district", "dem_vote_share", "rep_vote_share")
  votes %>%
    mutate(
      dem_vote_share = as.numeric(dem_vote_share) / 100,
      rep_vote_share = as.numeric(rep_vote_share) / 100,
      district = case_when(
        grepl("-AL", district) ~ stringr::str_replace(district, "-AL", "-01"),
        TRUE ~ district
      )
    )
}

presvote08 <- load_pres_vote("2008 President...19", "...20")

presvote12 <- load_pres_vote("2012 President...17", "...18")


# Block to CD crosswalk for 2010
# (had to download by state)
# https://www.census.gov/geographies/reference-files/time-series/geo/block-assignment-files.2010.html#list-tab-361828852

# Stack the per-state 2010 Block Assignment Files into one table. Each file is
# read with a header row; the code below expects the stacked table to expose a
# single column, `BLOCKID.DISTRICT`, holding "<block id>,<district>" strings.
filelist <- list.files(
  path = "data/raw/BAF 2010",
  pattern = "\\.txt$", # only files whose name ends in a literal ".txt"
  full.names = TRUE
)
datalist <- lapply(filelist, FUN = read.table, header = TRUE)
datafr <- do.call("rbind", datalist)

# Block-group-to-district crosswalk: one row per distinct
# (block group, district) pair.
blocks1 <- as.data.frame(str_split_fixed(datafr$BLOCKID.DISTRICT, ",", 2)) %>%
  mutate(
    GEOID = substr(V1, 1, 12),   # state + county + tract + block-group digit
    state = substr(V1, 1, 2),    # two-digit state FIPS
    district = V2,
    blockgrp = substr(V1, 12, 12) # one-digit block-group code
  ) %>%
  filter(district != "") %>%
  select(GEOID, state, district, blockgrp) %>%
  # `state` is character, so test membership in the zero-padded codes 01-56
  # rather than comparing strings against numbers (locale-dependent collation).
  filter(state %in% sprintf("%02d", 1:56)) %>%
  distinct()



# 2010 ----

# Members listed in term-112.csv, reduced to one record per district with a
# 0/1 `black` indicator, party, and year = 2010.
#
# * Rows whose `name` matches the exclusion pattern are dropped (presumably
#   members who joined or left mid-term, so each district keeps one member --
#   confirm against the raw file).
# * `black` is 1 when `name` matches any entry of the name list below. Entries
#   are regular-expression fragments (so "." matches any character) joined
#   with "|"; short fragments such as "Clay" or "Danny" match any name that
#   contains them.
# * Only `district`, `black`, `party` and `year` are kept; no other
#   race/ethnicity flag is carried forward.
# * District ids ending in "-0" are rewritten to "-01", and four-character ids
#   (e.g. "AL-1") are zero-padded to the "AL-01" form.
congress112 <- read.csv("data/raw/legislators/term-112.csv") %>%
  filter(!grepl("Ron Barber|Harman|Massie|Curson|Donald M. Payne, Jr.|Hochul|Robert L. Turner|David Wu|DelBene", name)) %>%
  mutate(
    black = as.numeric(grepl(
      paste(
        c(
          "Cummings", "Sewell", "Barbara Lee", "Maxine Waters", "Karen Bass",
          "Laura Richardson",
          "Eleanor Holmes Norton",
          "Corrine Brown",
          "Alcee L. Hastings",
          "Allen",
          "Frederica",
          "Sanford D. Bishop, Jr.",
          "John Lewis",
          "David Scott",
          "Henry C.",
          "Danny",
          "Jesse",
          "Bobby L. Rush",
          "Carson",
          "Cedric",
          "Donna",
          "Conyers",
          "Ellison",
          "Bennie",
          "Clay",
          "Cleaver",
          "Payne",
          "Meeks",
          "Clarke",
          "Rangel",
          "Butterfield",
          "Watt",
          "Fudge",
          "Fattah",
          "Clyburn",
          "Al Green",
          "Sheila Jackson Lee",
          "Eddie Bernice Johnson",
          "Christensen",
          "Robert C. \"Bobby\" Scott",
          "Gwen Moore"
        ),
        collapse = "|"
      ),
      name
    ))
  ) %>%
  select(district = area_id, black, party = group_id) %>%
  mutate(
    year = 2010,
    # Anchor at the end of the id so only a trailing "-0" is rewritten.
    district = stringr::str_replace(district, "-0$", "-01"),
    district = case_when(
      nchar(district) == 4 ~ gsub("-", "-0", district),
      TRUE ~ district
    )
  )

# 2010 CVAP by race at the census block-group level (one row per block group,
# one column per LNTITLE category), joined to the block-group-to-district
# crosswalk `blocks1`.
# A block group that `blocks1` assigns to more than one district appears once
# per district after the join, each time with its full CVAP estimates.
cvap_block2010 <- read.csv("data/raw/cvap_blockgroup_2010.csv") %>%
  filter(LNTITLE %in% c("Total", "White Alone", "Black or African American Alone")) %>%
  select(LNTITLE, GEOID, CVAP_EST) %>%
  pivot_wider(names_from = LNTITLE, values_from = CVAP_EST) %>%
  mutate(
    GEOID = gsub("15000US", "", GEOID), # strip the summary-level prefix
    state = substr(GEOID, 1, 2),
    county = substr(GEOID, 3, 5),
    tract = substr(GEOID, 6, 11),
    blockgroup = substr(GEOID, 12, 12)
  ) %>%
  filter(state %in% blocks1$state) %>%
  # Explicit keys: these are the only columns the two tables share.
  left_join(blocks1, by = c("GEOID", "state"))

# District-level 2010 dataset: white/Black shares of CVAP per congressional
# district, joined to the member table (`congress112`) and the 2008
# presidential Democratic vote share (`presvote08`).
dat10 <- cvap_block2010 %>%
  # Drop rows with the "ZZ" district code and rows that found no crosswalk match.
  filter(district != "ZZ" & !is.na(state) & !is.na(district)) %>%
  group_by(state, district) %>%
  summarize(
    total = sum(Total, na.rm = TRUE),
    black = sum(`Black or African American Alone`, na.rm = TRUE),
    white = sum(`White Alone`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    pct_white = white / total,
    pct_black = black / total,
    # District "00" is recoded to "01" to match the member and vote tables.
    district = case_when(district == "00" ~ "01", TRUE ~ district),
    year = 2010,
    state = fips_to_abbr(state),
    district = paste0(state, "-", district)
  ) %>%
  filter(state != "DC") %>%
  select(year, district, pct_white, pct_black) %>%
  left_join(congress112, by = c("district", "year")) %>%
  left_join(presvote08, by = "district") %>%
  select(-rep_vote_share)

# Block to CD crosswalk for 2012 and 2014
# https://www.census.gov/geographies/mapping-files/2015/dec/rdo/114-congressional-district-bef.html
# Read without `header = TRUE`, so every line of the file -- including any
# header line -- arrives as a value of V1 in "<block id>,<district>" form.
blocks <- read.table("data/raw/National_CD114.txt")

# Block-to-district crosswalk with one row per input line (i.e. per block).
# NOTE(review): unlike `blocks1`, this table is not reduced to distinct
# (GEOID, district) pairs, so a 12-character GEOID repeats once for every
# block it contains. Joining block-group data on GEOID therefore repeats each
# estimate once per block -- confirm that this weighting is intended.
blocks2 <- as.data.frame(str_split_fixed(blocks$V1, ",", 2)) %>%
  mutate(
    state = substr(V1, 1, 2),    # two-digit state FIPS
    block = substr(V1, 12, 15),  # four-character block code
    district = V2,
    GEOID = substr(V1, 1, 12)    # state + county + tract + block-group digit
  ) %>%
  select(GEOID, state, district, block) %>%
  # `state` is character, so test membership in the zero-padded codes 01-56
  # rather than comparing strings against numbers; this also discards any
  # non-numeric row such as a header line.
  filter(state %in% sprintf("%02d", 1:56))

# 2012 ----

# 2012 CVAP by race by census block
# https://www.census.gov/programs-surveys/decennial-census/about/voting-rights/cvap.2018.html#list-tab-1518558936
# 2012 CVAP estimates in wide form (one column per LNTITLE category), joined
# to the `blocks2` crosswalk on the 12-character GEOID.
# NOTE(review): `blocks2` has one row per block, so each CVAP row is repeated
# once for every crosswalk row sharing its GEOID -- confirm this is intended.
cvap_block2012 <- read.csv("data/raw/cvap_block_2012.csv") %>%
  filter(LNTITLE %in% c("Total", "White Alone", "Black or African American Alone")) %>%
  select(LNTITLE, GEOID, CVAP_EST) %>%
  pivot_wider(names_from = LNTITLE, values_from = CVAP_EST) %>%
  mutate(GEOID = gsub("15000US", "", GEOID)) %>% # strip the summary-level prefix
  # GEOID is the only column the two tables share; a full join keeps
  # unmatched rows from both sides.
  full_join(blocks2, by = "GEOID")

# White/Black shares of CVAP per congressional district for 2012.
cvap_district2012 <- cvap_block2012 %>%
  # Drop rows with the "ZZ" district code and rows that found no crosswalk match.
  filter(district != "ZZ" & !is.na(state) & !is.na(district)) %>%
  group_by(state, district) %>%
  summarize(
    total = sum(Total, na.rm = TRUE),
    black = sum(`Black or African American Alone`, na.rm = TRUE),
    white = sum(`White Alone`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    pct_white = white / total,
    pct_black = black / total,
    # District "00" is recoded to "01" to match the member and vote tables.
    district = case_when(district == "00" ~ "01", TRUE ~ district),
    year = 2012,
    state = fips_to_abbr(state),
    district = paste0(state, "-", district)
  ) %>%
  filter(state != "DC") %>%
  select(year, district, pct_white, pct_black)

# Members listed in term-113.csv, reduced to one record per district with a
# 0/1 `black` indicator, party, and year = 2012.
#
# * Rows whose `name` matches the exclusion pattern are dropped (presumably
#   members who joined or left mid-term, so each district keeps one member --
#   confirm against the raw file).
# * `black` is 1 when `name` matches any entry of the name list below. Entries
#   are regular-expression fragments (so "." matches any character) joined
#   with "|"; short fragments such as "Clay" or "Danny" match any name that
#   contains them.
# * Only `district`, `black`, `party` and `year` are kept; no other
#   race/ethnicity flag is carried forward.
# * District ids ending in "-0" are rewritten to "-01", and four-character ids
#   (e.g. "AL-1") are zero-padded to the "AL-01" form.
# * Any NC-12 row with black == 0 is removed, as are rows with a missing
#   district.
congress113 <- read.csv("data/raw/legislators/term-113.csv") %>%
  filter(!grepl("Bonner|Jolly|Clawson|Rodney Alexander|Jo Ann Emerson|Alma S. Adams|Norcross|Dave Brat", name)) %>%
  mutate(
    black = as.numeric(grepl(
      paste(
        c(
          "Cummings", "Robin L. Kelly", "Sewell", "Barbara Lee",
          "Maxine Waters", "Karen Bass",
          "McLeod",
          "Eleanor Holmes Norton",
          "Corrine Brown",
          "Alcee L. Hastings",
          "Frederica",
          "Sanford D. Bishop, Jr.",
          "John Lewis",
          "David Scott",
          "Henry C.",
          "Danny",
          "Bobby L. Rush",
          "Carson",
          "Cedric",
          "Donna",
          "Conyers",
          "Ellison",
          "Bennie",
          "Clay",
          "Cleaver",
          "Payne",
          "Meeks",
          "Clarke",
          "Rangel",
          "Jeffries",
          "Horsford",
          "Butterfield",
          "Watt",
          "Fudge",
          "Beatty",
          "Fattah",
          "Clyburn",
          "Al Green",
          "Sheila Jackson Lee",
          "Eddie Bernice Johnson",
          "Veasey",
          "Robert C. \"Bobby\" Scott",
          "Gwen Moore"
        ),
        collapse = "|"
      ),
      name
    ))
  ) %>%
  select(district = area_id, black, party = group_id) %>%
  mutate(
    year = 2012,
    # Anchor at the end of the id so only a trailing "-0" is rewritten.
    district = stringr::str_replace(district, "-0$", "-01"),
    district = case_when(
      nchar(district) == 4 ~ gsub("-", "-0", district),
      TRUE ~ district
    )
  ) %>%
  filter(!is.na(district), !(district == "NC-12" & black == 0))

# District-level 2012 dataset: CVAP shares joined to the member table
# (`congress113`) and the 2012 presidential Democratic vote share.
dat12 <- cvap_district2012 %>%
  left_join(congress113, by = c("district", "year")) %>%
  left_join(presvote12, by = "district") %>%
  select(-rep_vote_share)

# 2014 ----

# 2014 CVAP by race by census block
# https://www.census.gov/programs-surveys/decennial-census/about/voting-rights/cvap.2018.html#list-tab-1518558936
# 2014 CVAP estimates in wide form (one column per lntitle category), joined
# to the `blocks2` crosswalk on the 12-character GEOID. This file uses
# lower-case `lntitle` / `geoid` column names; the original `geoid` column is
# kept alongside the derived `GEOID`.
# NOTE(review): `blocks2` has one row per block, so each CVAP row is repeated
# once for every crosswalk row sharing its GEOID -- confirm this is intended.
cvap_block2014 <- read.csv("data/raw/cvap_block_2014.csv") %>%
  filter(lntitle %in% c("Total", "White Alone", "Black or African American Alone")) %>%
  select(lntitle, geoid, CVAP_EST) %>%
  pivot_wider(names_from = lntitle, values_from = CVAP_EST) %>%
  mutate(GEOID = gsub("15000US", "", geoid)) %>% # strip the summary-level prefix
  # GEOID is the only column the two tables share; a full join keeps
  # unmatched rows from both sides.
  full_join(blocks2, by = "GEOID")

# White/Black shares of CVAP per congressional district for 2014.
cvap_district2014 <- cvap_block2014 %>%
  # Drop rows with the "ZZ" district code and rows that found no crosswalk match.
  filter(district != "ZZ" & !is.na(state) & !is.na(district)) %>%
  group_by(state, district) %>%
  summarize(
    total = sum(Total, na.rm = TRUE),
    black = sum(`Black or African American Alone`, na.rm = TRUE),
    white = sum(`White Alone`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    pct_white = white / total,
    pct_black = black / total,
    # District "00" is recoded to "01" to match the member and vote tables.
    district = case_when(district == "00" ~ "01", TRUE ~ district),
    year = 2014,
    state = fips_to_abbr(state),
    district = paste0(state, "-", district)
  ) %>%
  filter(state != "DC") %>%
  select(year, district, pct_white, pct_black)

# get Black MCs for 2014
# Members listed in term-114.csv, reduced to one record per district with a
# 0/1 `black` indicator, party, and year = 2014.
#
# * Rows whose `name` matches the exclusion pattern are dropped (presumably
#   members who joined or left mid-term, so each district keeps one member --
#   confirm against the raw file).
# * `black` is 1 when `name` matches any entry of the name list below. Entries
#   are regular-expression fragments (so "." matches any character) joined
#   with "|"; short fragments such as "Love" or "Clay" match any name that
#   contains them.
# * District ids ending in "-0" are rewritten to "-01", and four-character ids
#   (e.g. "AL-1") are zero-padded to the "AL-01" form.
# * Any PA-02 row with black == 0 is removed, as are rows with a missing
#   district.
congress114 <- read.csv("data/raw/legislators/term-114.csv") %>%
  filter(!grepl("Hanabusa|Schock|Comer|Nunnelee|Warren Davidson|Dwight Evans", name)) %>%
  mutate(
    black = as.numeric(grepl(
      paste(
        c(
          "Robin L. Kelly", "Sewell", "Barbara Lee", "Maxine Waters",
          "Karen Bass",
          "McLeod",
          "Eleanor Holmes Norton",
          "Corrine Brown",
          "Alcee L. Hastings",
          "Frederica",
          "Sanford D. Bishop, Jr.",
          "John Lewis",
          "David Scott",
          "Henry C.",
          "Danny",
          "Bobby L. Rush",
          "Carson",
          "Cedric",
          "Cummings",
          "Donna",
          "Conyers",
          "Lawrence",
          "Ellison",
          "Bennie",
          "Clay",
          "Cleaver",
          "Payne",
          "Bonnie Watson Coleman",
          "Meeks",
          "Clarke",
          "Rangel",
          "Jeffries",
          "Butterfield",
          "Alma",
          "Fudge",
          "Beatty",
          "Fattah",
          "Clyburn",
          "Al Green",
          "Sheila Jackson Lee",
          "Eddie Bernice Johnson",
          "Veasey",
          "Hurd",
          "Love",
          "Plaskett",
          "Robert C. \"Bobby\" Scott",
          "Gwen Moore"
        ),
        collapse = "|"
      ),
      name
    ))
  ) %>%
  select(district = area_id, black, party = group_id) %>%
  mutate(
    year = 2014,
    # Anchor at the end of the id so only a trailing "-0" is rewritten.
    district = stringr::str_replace(district, "-0$", "-01"),
    district = case_when(
      nchar(district) == 4 ~ gsub("-", "-0", district),
      TRUE ~ district
    )
  ) %>%
  filter(!is.na(district), !(district == "PA-02" & black == 0))

# District-level 2014 dataset: CVAP shares joined to the member table
# (`congress114`). The presidential vote share attached here is the 2012 one
# (`presvote12`).
dat14 <- cvap_district2014 %>%
  left_join(congress114, by = c("district", "year")) %>%
  left_join(presvote12, by = "district") %>%
  select(-rep_vote_share)


# 2016 ----

# 115th Congress members guide: keep the district code, member name, party,
# gender, race/ethnicity, first-elected year, 2016 presidential vote columns
# and two ACS citizen-adult-population columns, then give them short names.
congress115 <- read_excel("data/raw/legislators/115th Congress Members Guide with Elections and Demographic Data by District.xlsx", sheet = 1) %>%
  select(c("Code", "...5", "Party", "Gender", "Race/ Ethnicity", "First Elected", "2016 President...15", "...16", "2012-2016 ACS Citizen Adult Population", "...28"))
# Drop the first two rows of the sheet (presumably sub-header rows -- confirm
# against the workbook).
congress115 <- congress115[3:nrow(congress115),]
colnames(congress115) <- c("district", "name", "party", "gender", "race", "year_elected", "dem_vote_share", "rep_vote_share", "pct_white", "pct_black")

# District-level 2016 dataset. Only the columns listed in the final select()
# are derived; shares and vote percentages are converted to proportions.
dat16 <- congress115 %>%
  mutate(
    party = tolower(party),
    # First matching label wins, so an entry mentioning "White" is coded
    # "white" even if it also mentions another group; unmatched entries are NA.
    race = case_when(
      grepl("White", race) ~ "white",
      grepl("Black", race) ~ "black",
      grepl("Asian", race) ~ "asian",
      grepl("Hispanic", race) ~ "hispanic"
    ),
    # NA race falls through to 0.
    black = case_when(race == "black" ~ 1, TRUE ~ 0),
    year = 2016,
    pct_white = as.numeric(pct_white) / 100,
    pct_black = as.numeric(pct_black) / 100,
    dem_vote_share = as.numeric(dem_vote_share) / 100,
    # "-AL" district codes become "-01", matching numbered districts.
    district = case_when(
      grepl("-AL", district) ~ stringr::str_replace(district, "-AL", "-01"),
      TRUE ~ district
    )
  ) %>%
  select(year, district, black, pct_black, pct_white, party, dem_vote_share)

# 2018 ----
# 116th Congress guide (first sheet): keep the member, election and population
# columns that feed the 2018 district table.
congress116 <- read_excel(
  "data/raw/legislators/116th Congress Members Guide with Elections and Demographic Data by District.xlsx",
  sheet = 1
)
congress116 <- select(
  congress116,
  c("Code", "Party", "Gender", "Race/ Ethnicity", "First Elected",
    "2016 President...15", "...16",
    "2014-2018 ACS Citizen Adult Population", "...30")
)
# Drop the first two rows, then give the retained columns short names
# (assigned by position, so the order must match the select() above).
congress116 <- congress116[3:nrow(congress116), ]
names(congress116) <- c("district", "party", "gender", "race", "year_elected",
                        "dem_vote_share", "rep_vote_share",
                        "pct_white", "pct_black")

# One row per district for 2018 with the member's race/party and the district's
# white / Black population shares and Democratic presidential vote share.
dat18 <- congress116 %>%
  mutate(
    party = tolower(party),
    # keep only the leading token of "First Elected" (text after the first
    # space or comma is discarded) and read it as an integer year
    year_elected = as.integer(gsub(",.*", "", gsub(" .*", "", year_elected))),
    race = case_when(
      grepl("White", race) ~ "white",
      grepl("Black", race) ~ "black",
      grepl("Asian", race) ~ "asian",
      grepl("Hispanic", race) ~ "hispanic"
    ),
    # 0/1 indicators; a missing race yields 0 in every indicator
    poc = as.numeric(!is.na(race) & race != "white"),
    black = as.numeric(race %in% "black"),
    asian = as.numeric(race %in% "asian"),
    hispanic = as.numeric(race %in% "hispanic"),
    year = 2018,
    gender = case_when(
      gender == "Man" ~ "male",
      gender == "Woman" ~ "female"
    ),
    # percentages -> proportions
    pct_white = as.numeric(pct_white) / 100,
    pct_black = as.numeric(pct_black) / 100,
    dem_vote_share = as.numeric(dem_vote_share) / 100,
    rep_vote_share = as.numeric(rep_vote_share) / 100,
    # at-large districts ("XX-AL") are recoded as district 01
    district = stringr::str_replace(district, "-AL", "-01")
  ) %>%
  select(year, district, black, pct_black, pct_white, party, dem_vote_share)

# 2020 ----
# 117th Congress guide (first sheet): keep the member, election and population
# columns that feed the 2020 district table.
congress117 <- read_excel("data/raw/legislators/117th Congress Members Guide with Election Results and Demographic Data by District.xlsx", sheet = 1) %>%
  select(c("Code", "Party", "Gender", "Race/ Ethnicity", "First Elected", "2020 President...15", "...16", "2015-2019 ACS Citizen Adult Population", "...34"))
# Drop the first two rows, then give the retained columns short names
# (assigned by position, so the order must match the select() above).
congress117 <- congress117[3:nrow(congress117),]
colnames(congress117) <- c("district", "party", "gender", "race", "year_elected", "dem_vote_share", "rep_vote_share", "pct_white", "pct_black")

# One row per district for 2020. Every transformation below is vectorised, so
# no rowwise() step is used: the result is an ordinary ungrouped tibble, the
# same shape of object that the 2016 and 2018 blocks produce.
dat20 <- congress117 %>%
  mutate(party = tolower(party),
         # keep only the leading token of "First Elected" and read it as a year
         year_elected = gsub(" .*", "", year_elected),
         year_elected = gsub(",.*", "", year_elected),
         year_elected = as.integer(year_elected),
         race = case_when(grepl("White", race) ~ "white",
                          grepl("Black", race) ~ "black",
                          grepl("Asian", race) ~ "asian",
                          grepl("Hispanic", race) ~ "hispanic"),
         # 0/1 indicators; a missing race falls through to 0
         poc = case_when(race != "white" ~ 1, TRUE ~ 0),
         black = case_when(race == "black" ~ 1, TRUE ~ 0),
         asian = case_when(race == "asian" ~ 1, TRUE ~ 0),
         hispanic = case_when(race == "hispanic" ~ 1, TRUE ~ 0),
         year = 2020,
         gender = case_when(gender == "Man" ~ "male",
                            gender == "Woman" ~ "female")) %>%
  mutate(# percentages -> proportions
         pct_white = as.numeric(pct_white) / 100,
         pct_black = as.numeric(pct_black) / 100,
         dem_vote_share = as.numeric(dem_vote_share) / 100,
         rep_vote_share = as.numeric(rep_vote_share) / 100,
         # at-large districts ("XX-AL") are recoded as district 01
         district = case_when(grepl("-AL", district) ~ stringr::str_replace(district, "-AL", "-01"),
                              TRUE ~ district)) %>%
  select(year, district, black, pct_black, pct_white, party, dem_vote_share)

# 2022 ----
# 118th Congress guide (first sheet): keep the member, election and population
# columns that feed the 2022 district table.
congress118 <- read_excel("data/raw/legislators/118th Congress Members Guide with Election Results and Demographic Data by District.xlsx", sheet = 1) %>%
  select(c("Code", "Incumbent Party", "Gender", "Race or Ethnicity", "Year First Elected", "2020 President...15", "...16", "Citizen Voting Age Population by Race (2016-2020 ACS)", "...34"))
# Drop the first two rows, then give the retained columns short names
# (assigned by position, so the order must match the select() above).
congress118 <- congress118[3:nrow(congress118),]
colnames(congress118) <- c("district", "party", "gender", "race", "year_elected", "dem_vote_share", "rep_vote_share", "pct_white", "pct_black")

# One row per district for 2022. Every transformation below is vectorised, so
# no rowwise() step is used: the result is an ordinary ungrouped tibble, the
# same shape of object that the 2016 and 2018 blocks produce.
dat22 <- congress118 %>%
  mutate(party = tolower(party),
         # keep only the leading token of "Year First Elected" and read it as a year
         year_elected = gsub(" .*", "", year_elected),
         year_elected = gsub(",.*", "", year_elected),
         year_elected = as.integer(year_elected),
         race = case_when(grepl("White", race) ~ "white",
                          grepl("Black", race) ~ "black",
                          grepl("Asian", race) ~ "asian",
                          grepl("Hispanic", race) ~ "hispanic"),
         # 0/1 indicators; a missing race falls through to 0
         poc = case_when(race != "white" ~ 1, TRUE ~ 0),
         black = case_when(race == "black" ~ 1, TRUE ~ 0),
         asian = case_when(race == "asian" ~ 1, TRUE ~ 0),
         hispanic = case_when(race == "hispanic" ~ 1, TRUE ~ 0),
         year = 2022,
         gender = case_when(gender == "Man" ~ "male",
                            gender == "Woman" ~ "female")) %>%
  mutate(# percentages -> proportions
         pct_white = as.numeric(pct_white) / 100,
         pct_black = as.numeric(pct_black) / 100,
         dem_vote_share = as.numeric(dem_vote_share) / 100,
         rep_vote_share = as.numeric(rep_vote_share) / 100,
         # at-large districts ("XX-AL") are recoded as district 01
         district = case_when(grepl("-AL", district) ~ stringr::str_replace(district, "-AL", "-01"),
                              TRUE ~ district)) %>%
  select(year, district, black, pct_black, pct_white, party, dem_vote_share)

# join years ----
# Stack the per-cycle district tables, classify each district by racial
# composition and each seat by party bloc, and save the combined panel.
dat <- mutate(
  rbind(dat10, dat12, dat14, dat16, dat18, dat20, dat22),
  # white majority is checked first, then Black majority; everything else
  # (including rows with missing shares) is labelled "Majority-minority"
  type = case_when(
    pct_white > 0.5 ~ "Majority white",
    pct_black > 0.5 ~ "Majority Black",
    TRUE ~ "Majority-minority"
  ),
  # independents are grouped with Democrats and libertarians with Republicans;
  # any other value (including a missing party) becomes "Vacant"
  party = case_when(
    grepl("democ|independent", party) ~ "Democrat",
    grepl("repub|libertarian", party) ~ "Republican",
    TRUE ~ "Vacant"
  ),
  # cycles counted in two-year steps from 2010 (2010 -> 0, 2012 -> 1, ...)
  yearnum = (year - 2010) / 2
)

write.csv(dat, file = "data/clean/congress.csv")

# SECTION 2 ----

# data for section 2 are the property of the respective authors who conducted the original studies included in this meta-analysis.
# see 0_meta_analysis_figures.R for the code used to produce the figures in this section.

# data for SECTION 3 ----

# Weighting targets used by the left_join(targets) steps below; the `X` column
# is dropped.
targets <- select(read.csv("data/raw/genderageregionweights.csv"), -X)

# Study 1 panel for the attitudes figure.
#
# Keeps rows with race == "Black", reshapes the two attitude measures
# (disc_black, rr_index) to long format, attaches weights by
# region x age group x gender, and computes for every outcome/value cell the
# weighted share of rows with chosen_candidate == 1 (`groupavg`) together with
# the weighted cell size (`weighted_n`).
attitudesfig <- read.csv("data/raw/study1.csv") %>%
  filter(race == "Black") %>%
  mutate(chosen_candidate = as.numeric(won),
         age = rescale(age, to = c(18, 83)),
         age_group = case_when(age < 30 ~ "18-29",
                               age < 40 ~ "30-39",
                               age < 50 ~ "40-49",
                               age < 60 ~ "50-59",
                               age < 100 ~ "60p"),
         region = tolower(region),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  dplyr::select(chosen_candidate, rr_index, disc_black, age_group, region, gender) %>%
  pivot_longer(c(disc_black, rr_index), names_to = "outcome", values_to = "value") %>%
  # Only disc_black and rr_index are pivoted, so the "policy_payreps" branch is
  # never hit here; it is kept so labels and factor levels stay unchanged.
  mutate(outcome = case_when(outcome == "disc_black" ~ "Perceived anti-Black discrimination",
                             outcome == "policy_payreps" ~ "Support for reparations",
                             TRUE ~ "Racial resentment"),
         outcome = factor(outcome, levels = c("Perceived anti-Black discrimination", "Racial resentment", "Support for reparations"), ordered = TRUE),
         weightgroup = "region x ag x gender",
         weightvar = paste(region, age_group, gender, sep = " x "),
         group = "2022 x White x Democrat") %>%
  left_join(targets) %>%
  # n = total rows in the weighting group; n() below = rows in the weighting cell
  group_by(group, weightgroup) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(weightgroup, group, weightvar) %>%
  mutate(weight = grouptarget * n / n(),
         subgroup_n = n(),
         # weights are trimmed to [0.1, 5]; cells with no target get weight 1
         # https://zacharylhertz.github.io/posts/2022/05/weighting-surveys
         weight = case_when(weight > 5 ~ 5,
                            weight < 0.1 ~ 0.1,
                            is.na(weight) ~ 1,
                            TRUE ~ weight)) %>%
  ungroup() %>%
  # rescale so the weights average to 1
  group_by(group, weightgroup) %>%
  mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(outcome, value) %>%
  # weighted.mean()'s weight argument is `w`; any other name (e.g. `weights`)
  # is swallowed by `...` and the mean is computed unweighted.
  mutate(groupavg = weighted.mean(chosen_candidate, w = weight, na.rm = TRUE),
         weighted_n = sum(weight)) %>%
  ungroup()

# Study 2 panel for the attitudes figure.
#
# Keeps rows with race == "Black", reshapes the Trump / Biden feeling
# thermometers to long format (values rounded to one decimal), attaches weights
# by region x age group x gender, and computes for every outcome/value cell the
# weighted share of rows with chosen_candidate == 1 (`groupavg`) together with
# the weighted cell size (`weighted_n`).
attitudesfig2 <- read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
  filter(race == "Black") %>%
  mutate(age = rescale(age, to = c(18, 97)),
         age_group = case_when(age < 30 ~ "18-29",
                               age < 40 ~ "30-39",
                               age < 50 ~ "40-49",
                               age < 60 ~ "50-59",
                               age < 100 ~ "60p"),
         # empty region strings are treated as missing
         region = case_when(region != "" ~ tolower(region), TRUE ~ NA_character_),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  pivot_longer(c(therm_trump, therm_biden), names_to = "outcome", values_to = "value") %>%
  mutate(outcome = case_when(grepl("trump", outcome) ~ "Trump feeling thermometer",
                             TRUE ~ "Biden feeling thermometer"),
         value = round(value, 1)) %>%
  select(c(age_group, region, gender, outcome, value, chosen_candidate)) %>%
  mutate(weightgroup = "region x ag x gender",
         weightvar = paste(region, age_group, gender, sep = " x "),
         group = "2022 x White x Democrat") %>%
  left_join(targets) %>%
  # n = total rows in the weighting group; n() below = rows in the weighting cell
  group_by(group, weightgroup) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(weightgroup, group, weightvar) %>%
  mutate(weight = grouptarget * n / n(),
         subgroup_n = n(),
         # weights are trimmed to [0.1, 5]; cells with no target get weight 1
         # https://zacharylhertz.github.io/posts/2022/05/weighting-surveys
         weight = case_when(weight > 5 ~ 5,
                            weight < 0.1 ~ 0.1,
                            is.na(weight) ~ 1,
                            TRUE ~ weight)) %>%
  ungroup() %>%
  # rescale so the weights average to 1
  group_by(group, weightgroup) %>%
  mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(outcome, value) %>%
  # weighted.mean()'s weight argument is `w`; any other name (e.g. `weights`)
  # is swallowed by `...` and the mean is computed unweighted.
  mutate(groupavg = weighted.mean(chosen_candidate, w = weight, na.rm = TRUE),
         weighted_n = sum(weight)) %>%
  ungroup()

# Stack the study 1 and study 2 attitude panels and save them for plotting.
attitudesfig3 <- do.call(rbind, list(attitudesfig, attitudesfig2))

write.csv(attitudesfig3, file = "data/clean/attitudesfigdat.csv")

# Study 1 rows for the reparations analysis (sample 1). `X` is built from the
# respondent id plus the sample number so it can serve as a grouping key once
# the two samples are stacked.
reparations1 <- read.csv("data/raw/study1.csv") %>%
  mutate(
    sample = 1,
    X = paste(respondent, sample),
    chosen_candidate = won,
    placement = rescale(placement, to = c(1, 0)),
    # placeholders; not carried past the select() below
    candselfplacement = NA_character_,
    canddistance = NA_real_,
    realdirdiff = NA_real_
  ) %>%
  mutate(
    # demographics used for weighting
    age = rescale(age, to = c(18, 83)),
    age_group = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 60 ~ "50-59",
      age < 100 ~ "60p"
    ),
    region = tolower(region),
    gender = case_when(female == 0 ~ 1, female == 1 ~ 2)
  ) %>%
  dplyr::select(race, chosen_candidate, X, reparations, policy_payreps,
                sample, age_group, region, gender)

# Study 2 rows for the reparations analysis (sample 2): respondents with pid in
# 5-7 in the "Policy" treatment. Age is rescaled on the full file before the
# filter is applied, so the order of these steps matters.
reparations2 <- csvy::read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
  mutate(
    age = rescale(age, to = c(18, 97)),
    age_group = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 60 ~ "50-59",
      age < 100 ~ "60p"
    ),
    # empty region strings are treated as missing
    region = case_when(region != "" ~ tolower(region), TRUE ~ NA_character_),
    gender = case_when(female == 0 ~ 1, female == 1 ~ 2)
  ) %>%
  filter(pid %in% c(5, 6, 7), treat == "Policy") %>%
  mutate(
    sample = 2,
    X = paste(X, sample),
    placement = NA_real_
  ) %>%
  # policy_reparations is renamed to policy_payreps to line up with study 1
  dplyr::select(race, chosen_candidate, X, reparations,
                policy_payreps = policy_reparations,
                sample, age_group, region, gender)


# Combined reparations data: one row per candidate profile, labelled with the
# stance pairing of the two candidates in the same `X` group and weighted by
# region x age group x gender.
reparations <- rbind(reparations1, reparations2) %>%
  mutate(
    reparations = factor(reparations),
    race = factor(race)
  ) %>%
  group_by(X) %>%
  mutate(
    nrow = row_number(),
    # the first row takes the next row's stance, later rows take the previous one
    opponent_reparations = case_when(nrow == 1 ~ lead(reparations),
                                     TRUE ~ lag(reparations)),
    # each row records the stance for its own race only; the other race's
    # stance is filled in from the rest of the group just below
    black_stance = ifelse(race == "Black", tolower(as.character(reparations)), NA_character_),
    white_stance = ifelse(race == "White", tolower(as.character(reparations)), NA_character_)
  ) %>%
  fill(black_stance, white_stance, .direction = "updown") %>%
  ungroup() %>%
  mutate(
    stances = case_when(
      white_stance == black_stance ~ paste("Both candidates", black_stance),
      race == "Black" ~ paste0("This candidate ", black_stance, "s\nopponent ", white_stance, "s"),
      race == "White" ~ paste0("This candidate ", white_stance, "s\nopponent ", black_stance, "s")
    ),
    # a respondent value of 0.5 is folded into 1 before labelling
    policy_payreps = case_when(policy_payreps == 0.5 ~ 1, TRUE ~ policy_payreps),
    policy_payreps = case_when(
      policy_payreps == 0 ~ "Respondent opposes reparations",
      policy_payreps == 1 ~ "Respondent supports reparations\nor doesn't know"
    ),
    race = factor(race, levels = c("White", "Black"), ordered = TRUE)
  ) %>%
  dplyr::select(policy_payreps, race, stances, chosen_candidate, age_group, region, gender) %>%
  mutate(
    weightgroup = "region x ag x gender",
    weightvar = paste(region, age_group, gender, sep = " x "),
    group = "2022 x White x Democrat"
  ) %>%
  left_join(targets) %>%
  # n = total rows in the weighting group; n() below = rows in the weighting cell
  group_by(group, weightgroup) %>%
  mutate(n = n()) %>%
  group_by(weightgroup, group, weightvar) %>%
  mutate(
    weight = grouptarget * n / n(),
    subgroup_n = n(),
    # weights are trimmed to [0.1, 5]; cells with no target get weight 1
    # https://zacharylhertz.github.io/posts/2022/05/weighting-surveys
    weight = case_when(
      weight > 5 ~ 5,
      weight < 0.1 ~ 0.1,
      is.na(weight) ~ 1,
      TRUE ~ weight
    )
  ) %>%
  # rescale so the weights average to 1
  group_by(group, weightgroup) %>%
  mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
  ungroup()


# Estimate and standard error for Black candidate profiles within each
# respondent-position x stance-pairing cell: a weighted no-intercept regression
# of chosen_candidate on a Black indicator, with a 1.96-SE interval.
reps_black <- reparations %>%
  group_by(policy_payreps, stances) %>%
  do(
    estimate = coef(summary(lm(chosen_candidate ~ as.numeric(race == "Black") + 0,
                               weights = weight, data = .)))[1, 1],
    std.error = coef(summary(lm(chosen_candidate ~ as.numeric(race == "Black") + 0,
                                weights = weight, data = .)))[1, 2]
  ) %>%
  rowwise() %>%
  mutate(
    # do() returns list-columns; unwrap them to plain numbers
    estimate = as.numeric(estimate),
    std.error = as.numeric(std.error),
    upper = estimate + 1.96 * std.error,
    lower = estimate - 1.96 * std.error,
    race = "Black"
  )

# Estimate and standard error for White candidate profiles within each
# respondent-position x stance-pairing cell: a weighted no-intercept regression
# of chosen_candidate on a White indicator, with a 1.96-SE interval.
reps_white <- reparations %>%
  group_by(policy_payreps, stances) %>%
  do(
    estimate = coef(summary(lm(chosen_candidate ~ as.numeric(race == "White") + 0,
                               weights = weight, data = .)))[1, 1],
    std.error = coef(summary(lm(chosen_candidate ~ as.numeric(race == "White") + 0,
                                weights = weight, data = .)))[1, 2]
  ) %>%
  rowwise() %>%
  mutate(
    # do() returns list-columns; unwrap them to plain numbers
    estimate = as.numeric(estimate),
    std.error = as.numeric(std.error),
    upper = estimate + 1.96 * std.error,
    lower = estimate - 1.96 * std.error,
    race = "White"
  )

# Weighted number of profiles per cell and candidate race (rounded to 2 dp).
ns <- summarize(
  group_by(reparations, policy_payreps, stances, race),
  n = round(sum(weight), 2)
)

# Estimates for both candidate races with the weighted cell sizes attached.
reps <- full_join(rbind(reps_black, reps_white), ns)



# Vertical position for significance annotations: just above the tallest upper
# bound within each stance pairing x respondent position.
ymax <- summarize(
  group_by(reps, stances, policy_payreps),
  y.position = max(upper) + 0.05
)

# Black-vs-White difference within each respondent-position x stance-pairing
# cell: the second coefficient of a weighted regression of chosen_candidate on
# candidate race, with a 1.96-SE interval, an approximate two-sided p-value and
# the labels / positions used to annotate the figure.
repstemp3 <- reparations %>%
  group_by(policy_payreps, stances) %>%
  do(estimate = summary(lm(chosen_candidate ~ factor(race, ordered = FALSE), weights = weight, data = .))$coefficients[2, 1],
     std.error = summary(lm(chosen_candidate ~ factor(race, ordered = FALSE), weights = weight, data = .))$coefficients[2, 2]) %>%
  rowwise() %>%
  mutate(# do() returns list-columns; unwrap them to plain numbers
         estimate = as.numeric(estimate),
         std.error = as.numeric(std.error),
         upper = estimate + 1.96 * std.error,
         lower = estimate - 1.96 * std.error,
         z = estimate / std.error,
         # The p-value approximation is only valid for a non-negative z, so it
         # is evaluated at |z|; with a signed z a negative difference would get
         # an inflated p-value (possibly above 1).
         p = exp(-0.717 * abs(z) - 0.416 * abs(z)^2)) %>%
  ungroup() %>%
  mutate(group1 = "White",
         group2 = "Black",
         diff = round(estimate, 3),
         plab = case_when(p < 0.001 ~ " < 0.001",
                          p < 0.01 ~ " < 0.01",
                          p < 0.05 ~ " < 0.05",
                          TRUE ~ paste(" = ", round(p, 3), sep = "")),
         p.signif = case_when(p < 0.001 ~ "***",
                              p < 0.01 ~ "**",
                              p < 0.05 ~ "*",
                              TRUE ~ "")) %>%
  full_join(ymax)

# Table of per-race estimates: each estimate is tested against 0.5, starred by
# its approximate two-sided p-value, and the table is reshaped so Black and
# White candidates sit side by side as "estimate + stars (SE)" strings.
repstab <- reps %>%
  dplyr::select(policy_payreps, race, stances, estimate, std.error, n) %>%
  mutate(# The p-value approximation is only valid for a non-negative z, so the
         # distance from 0.5 is taken in absolute value; with a signed z,
         # estimates below 0.5 would never be starred.
         z = abs(estimate - 0.5) / std.error,
         pval = exp(-0.717 * z - 0.416 * z^2),
         pstars = case_when(pval < 0.001 ~ "***",
                            pval < 0.01 ~ "**",
                            pval < 0.05 ~ "*",
                            TRUE ~ "")) %>%
  dplyr::select(-c(pval, z)) %>%
  pivot_wider(names_from = race, values_from = c(estimate, std.error, pstars, n)) %>%
  mutate(estimate_black = paste(round(estimate_Black, 3), pstars_Black, " (", round(std.error_Black, 3), ")", sep = ""),
         estimate_white = paste(round(estimate_White, 3), pstars_White, " (", round(std.error_White, 3), ")", sep = "")) %>%
  dplyr::select(policy_payreps, stances, estimate_black, n_Black, estimate_white, n_White) %>%
  arrange(stances, policy_payreps)

# Black-White difference formatted as "estimate + stars (SE)".
repstabdiff <- dplyr::select(
  mutate(
    repstemp3,
    estimate_diff = paste0(round(estimate, 3), p.signif,
                           " (", round(std.error, 3), ")")
  ),
  policy_payreps, stances, estimate_diff
)

# Final table: per-race columns plus the difference column, sorted by
# respondent position (descending) and with the stance pairing shortened to a
# "candidate/opponent" code.
repstab <- full_join(repstab, repstabdiff) %>%
  arrange(desc(policy_payreps)) %>%
  dplyr::select(-c(policy_payreps)) %>%
  mutate(
    stances = gsub("\\n", ", ", stances),
    stances = case_when(
      grepl("This candidate opposes", stances) ~ "Oppose/support",
      grepl("This candidate supports", stances) ~ "Support/oppose",
      grepl("Both candidates oppose", stances) ~ "Oppose/oppose",
      grepl("Both candidates support", stances) ~ "Support/support"
    )
  )

# Save the figure data, the difference tests and the formatted table.
write.csv(reps, file = "data/clean/reparationsfig.csv")
write.csv(repstemp3, file = "data/clean/reparationsfig_pvals.csv")
write.csv(repstab, file = "data/clean/reparationstab.csv")


# Study 2 rows for the ideological-distance analysis.
#
# For each candidate profile this computes the respondent-candidate ideological
# distance, the same distance for the other profile(s) in the same `X` group,
# and the difference between the two (rel_ideo_distance). All arithmetic is
# vectorised, so no rowwise() steps are needed.
dat_ideo1 <- read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
  mutate(age = rescale(age, to = c(18, 97)),
         age_group = case_when(age < 30 ~ "18-29",
                               age < 40 ~ "30-39",
                               age < 50 ~ "40-49",
                               age < 60 ~ "50-59",
                               age < 100 ~ "60p"),
         # empty region strings are treated as missing
         region = case_when(region != "" ~ tolower(region), TRUE ~ NA_character_),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  filter(!ideo %in% c(1, 2)) %>% # remove participants who are more conservative than any of the candidates on offer
  mutate(real_ideo_distance = ideo - candselfplacement_numfull, # positive values -> R more liberal than candidate
         abs_ideo_distance = abs(real_ideo_distance)) %>%
  # group totals, used below to back out the other profile's values
  group_by(X) %>%
  mutate(temp = sum(abs_ideo_distance, na.rm = FALSE),
         temp2 = sum(candselfplacement_numfull)) %>%
  ungroup() %>%
  # group total minus own value = the other profile's value
  # (assumes two candidate rows per X -- TODO confirm)
  mutate(other_ideo_distance = temp - abs_ideo_distance,
         other_candselfplacement_numfull = temp2 - candselfplacement_numfull,
         rel_ideo_distance = abs_ideo_distance - other_ideo_distance,
         real_ideo_distance_cands = candselfplacement_numfull - other_candselfplacement_numfull) %>%
  # rel_ideo_distance is rescaled with the direction reversed (to = c(1, -1))
  mutate(real_ideo_distance = rescale(real_ideo_distance, to = c(-1, 1)),
         abs_ideo_distance = rescale(abs_ideo_distance, to = c(0, 1)),
         rel_ideo_distance = rescale(rel_ideo_distance, to = c(1, -1)),
         real_ideo_distance_cands = rescale(real_ideo_distance_cands, to = c(-1, 1)),
         rel_ideo_incongruence = rescale(abs_ideo_distance, to = c(1, 0)),
         strength_dem = case_when(pid == 7 ~ 1,
                                  pid == 6 ~ 0.5,
                                  pid == 5 ~ 0,
                                  TRUE ~ NA_real_)) %>%
  dplyr::select(chosen_candidate, rel_ideo_distance, abs_ideo_distance, race, X, age_group, region, gender) %>%
  mutate(rel_ideo_distance = round(rel_ideo_distance, 2))

# CA omnibus rows for the ideological-distance analysis (respondents with
# liberal > 0.3; region fixed to "west").
#
# For each candidate profile this computes the respondent-candidate ideological
# distance, the same distance for the other profile(s) of the same ResponseId,
# and the difference between the two (rel_ideo_distance). All arithmetic is
# vectorised, so no rowwise() steps are needed.
dat_ideo2 <- read.csv("data/raw/ca_omnibus.csv") %>%
  filter(liberal > 0.3) %>%
  mutate(# candidate ideology label -> numeric score (1 = very liberal);
         # labels not listed here become NA
         candideo_num = case_when(candideo == "Very liberal" ~ 1,
                                  candideo == "Liberal" ~ 5/6,
                                  candideo == "Somewhat liberal" ~ 2/3,
                                  candideo == "Moderate" ~ 0.5,
                                  candideo == "Somewhat conservative" ~ 1/3,
                                  TRUE ~ NA_real_),
         region = "west",
         age_group = case_when(age < 30 ~ "18-29",
                               age < 40 ~ "30-39",
                               age < 50 ~ "40-49",
                               age < 60 ~ "50-59",
                               age < 150 ~ "60p"),
         gender = case_when(gender == "Male" ~ 1, gender == "Female" ~ 2)) %>%
  mutate(real_ideo_distance = liberal - candideo_num,
         abs_ideo_distance = abs(real_ideo_distance)) %>%
  # group totals, used below to back out the other profile's values
  group_by(ResponseId) %>%
  mutate(temp = sum(abs_ideo_distance, na.rm = FALSE),
         temp2 = sum(candideo_num)) %>%
  ungroup() %>%
  # group total minus own value = the other profile's value
  # (assumes two candidate rows per ResponseId -- TODO confirm)
  mutate(other_ideo_distance = temp - abs_ideo_distance,
         other_candideo_num = temp2 - candideo_num,
         rel_ideo_distance = abs_ideo_distance - other_ideo_distance,
         real_ideo_distance_cands = candideo_num - other_candideo_num) %>%
  # rel_ideo_distance is rescaled with the direction reversed (to = c(1, -1))
  mutate(real_ideo_distance = rescale(real_ideo_distance, to = c(-1, 1)),
         abs_ideo_distance = rescale(abs_ideo_distance, to = c(0, 1)),
         rel_ideo_distance = rescale(rel_ideo_distance, to = c(1, -1)),
         real_ideo_distance_cands = rescale(real_ideo_distance_cands, to = c(-1, 1)),
         rel_ideo_incongruence = rescale(abs_ideo_distance, to = c(1, 0)),
         race = candrace,
         X = ResponseId) %>%
  dplyr::select(chosen_candidate, rel_ideo_distance, abs_ideo_distance, race, X, age_group, gender, region)


# Stack both ideological-distance samples, round the relative distance to two
# decimals and attach weights by region x age group x gender.
dat_ideo <- rbind(dat_ideo1, dat_ideo2) %>%
  mutate(
    rel_ideo_distance = round(rel_ideo_distance, 2),
    weightgroup = "region x ag x gender",
    weightvar = paste(region, age_group, gender, sep = " x "),
    group = "2022 x White x Democrat"
  ) %>%
  left_join(targets) %>%
  # n = total rows in the weighting group; n() below = rows in the weighting cell
  group_by(group, weightgroup) %>%
  mutate(n = n()) %>%
  group_by(weightgroup, group, weightvar) %>%
  mutate(
    weight = grouptarget * n / n(),
    subgroup_n = n(),
    # weights are trimmed to [0.1, 5]; cells with no target get weight 1
    # https://zacharylhertz.github.io/posts/2022/05/weighting-surveys
    weight = case_when(
      weight > 5 ~ 5,
      weight < 0.1 ~ 0.1,
      is.na(weight) ~ 1,
      TRUE ~ weight
    )
  ) %>%
  # rescale so the weights average to 1
  group_by(group, weightgroup) %>%
  mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
  ungroup()


# Weighted estimate of chosen_candidate at every level of rel_ideo_distance
# (no-intercept model, one coefficient per level), fitted separately on the
# race == "Black" and race == "White" rows. `term` is converted from the
# coefficient label back to the numeric distance.
mod1 <- lm(chosen_candidate ~ factor(rel_ideo_distance) + 0,
           data = dat_ideo[dat_ideo$race == "Black", ],
           weights = weight) %>%
  tidy(conf.int = TRUE) %>%
  filter(grepl("ideo", term)) %>%
  mutate(model = "Black",
         term = gsub("factor\\(rel_ideo_distance\\)", "", term),
         term = as.numeric(term))
mod2 <- lm(chosen_candidate ~ factor(rel_ideo_distance) + 0,
           data = dat_ideo[dat_ideo$race == "White", ],
           weights = weight) %>%
  tidy(conf.int = TRUE) %>%
  filter(grepl("ideo", term)) %>%
  mutate(model = "White",
         term = gsub("factor\\(rel_ideo_distance\\)", "", term),
         term = as.numeric(term))

# Both models together, each estimate tested against 0.5 with an approximate
# two-sided p-value computed from |z|.
mods <- rbind(mod1, mod2) %>%
  arrange(desc(term)) %>%
  ungroup() %>%
  mutate(
    z = abs(estimate - 0.5) / std.error,
    diff_from_50 = exp(-0.717 * z - 0.416 * z^2),
    diff_from_50_stars = case_when(
      diff_from_50 < 0.001 ~ "***",
      diff_from_50 < 0.01 ~ "**",
      diff_from_50 < 0.05 ~ "*",
      TRUE ~ ""
    )
  )

# Vertical position for significance annotations: just above the taller upper
# confidence bound at each distance level.
ymax <- summarize(group_by(mods, term), y.position = max(conf.high) + 0.05)

# Race difference at every level of rel_ideo_distance: the second coefficient
# of a weighted regression of chosen_candidate on candidate race, with a
# 1.96-SE interval, an approximate two-sided p-value (from |z|) and the
# label / bracket positions used to annotate the figure.
modsp <- dat_ideo %>%
  filter(!is.na(rel_ideo_distance)) %>%
  group_by(rel_ideo_distance) %>%
  do(
    estimate = coef(summary(lm(chosen_candidate ~ factor(race, ordered = FALSE),
                               weights = weight, data = .)))[2, 1],
    std.error = coef(summary(lm(chosen_candidate ~ factor(race, ordered = FALSE),
                                weights = weight, data = .)))[2, 2]
  ) %>%
  rowwise() %>%
  mutate(
    # do() returns list-columns; unwrap them to plain numbers
    estimate = as.numeric(estimate),
    std.error = as.numeric(std.error),
    upper = estimate + 1.96 * std.error,
    lower = estimate - 1.96 * std.error,
    z = estimate / std.error,
    p = exp(-0.717 * abs(z) - 0.416 * abs(z)^2)
  ) %>%
  ungroup() %>%
  mutate(
    group1 = "White",
    group2 = "Black",
    diff = round(estimate, 3),
    plab = case_when(
      p < 0.001 ~ " < 0.001",
      TRUE ~ paste0(" = ", round(p, 3))
    ),
    p.signif = case_when(
      p < 0.001 ~ "***",
      p < 0.01 ~ "**",
      p < 0.05 ~ "*",
      TRUE ~ ""
    ),
    term = rel_ideo_distance,
    # bracket ends sit 0.03 either side of the distance level
    xmin = term - 0.03,
    xmax = term + 0.03
  ) %>%
  full_join(ymax)


# Save the weighted data, the per-race estimates and the difference tests.
write.csv(dat_ideo, file = "data/clean/dat_ideo.csv")
write.csv(mods, file = "data/clean/ideo_mods.csv")
write.csv(modsp, file = "data/clean/ideo_modsp.csv")


# data for APPENDIX
# appendix A ----


# Candidate-level data from SorensenChenPRQ.csv: rows with receipts above 2000
# and a non-missing district, with win / white recoded to 0/1 and a single
# `race` label derived from the race indicators, reduced to the analysis
# columns.
dat <- read.csv("data/raw/SorensenChenPRQ.csv") %>%
  filter(receipts > 2000, !is.na(district)) %>%
  mutate(
    win = case_when(
      win == "Win" ~ 1,
      win == "Lost" ~ 0,
      TRUE ~ NA_real_
    ),
    white = case_when(white == "White" ~ 1, TRUE ~ 0),
    # the first matching indicator wins; `white` here is the recoded 0/1 value
    race = case_when(
      white == 1 ~ "white",
      black == 1 ~ "black",
      asian == 1 ~ "asian",
      hispanic == 1 ~ "hispanic",
      nativeam == 1 ~ "native american",
      is.na(race) ~ "other",
      TRUE ~ NA_character_
    )
  ) %>%
  select(c(receipts, cycle, white, black, gender, safep, status, educpct,
           HHincome, whitepct_nohisp, pctwomen_leg, south, seniority,
           leadership, comchair, experience, indivs, pacsandothercomittees,
           win, party, id, votepct, statedis, candidate, race))

# Read the 2020 and 2022 FEC candidate summaries and tag each with its cycle.
dat20 <- mutate(read.csv("data/raw/candidate_summary_2020.csv"), year = 2020)
dat22 <- mutate(read.csv("data/raw/candidate_summary_2022.csv"), year = 2022)

# Combine the 2020 and 2022 candidate summaries (full join on all shared
# columns), keep House candidates, and build a `district` code (state + two-digit
# district) and a cleaned `surname` for each candidate.
# Note: the result is still grouped by Cand_Id when this pipeline ends.
dat20 <- reduce(list(dat20, dat22), full_join) %>%
  filter(Cand_Office=="H") %>%
  # single-digit districts are zero-padded; a "00" district code is rewritten as "01"
  mutate(district = case_when(Cand_Office_Dist < 10~paste(Cand_Office_St, "0", Cand_Office_Dist, sep = ""),
                              T~paste(Cand_Office_St, Cand_Office_Dist, sep = "")),
         district = gsub("00", "01", district),
         # surname = everything before the first comma of the candidate name
         surname = gsub(",.*", "", Cand_Name)) %>%
  select(year, Cand_Name, surname, Cand_Id, district, Cand_Party_Affiliation, Cand_Incumbent_Challenger_Open_Seat,
         Total_Receipt, Total_Disbursement, Individual_Contribution, Party_Committee_Contribution, Total_Contribution) %>%
  # within a candidate id, use the first surname / name seen for every row
  group_by(Cand_Id) %>%
  mutate(surname = trimws(first(surname)),
         surname = gsub("[[:punct:]]", "", surname),
         Cand_Name = first(Cand_Name),
         # Manual surname corrections; the first matching rule wins, so the order
         # of the rules matters. Unlisted surnames are kept as they are.
         # NOTE(review): the rules conditioned on year==2014 look unreachable here,
         # since only the 2020 and 2022 summaries are joined above -- confirm.
         surname = case_when(surname=="CHESNUT"~"CHESTNUT",
                             surname=="BRIGHT SR"~"BRIGHT",
                             surname=="MAUPIN JR"~"MAUPIN",
                             surname=="FULLERKENDALL"~"FULLER",
                             Cand_Name=="SHAH, JAYENDRA ARVINDLAL DR"~"SHAH, JAY",
                             Cand_Name=="SHAH, USHA JAYENDRA MRS"~"SHAH, USHA",
                             Cand_Name=="CARTER, EARL LEROY"&year==2014~"CARTER, EARL",
                             Cand_Name=="CARTER, JAMES DARWIN"&year==2014~"CARTER, JAMES DARWIN",
                             surname=="QUANG"~"PHAN",
                             surname=="PALOMARESSTARBUCK"~"STARBUCK",
                             surname=="MCMCANDLESS"~"MCCANDLESS",
                             surname=="RIBIERO"~"RIBEIRO",
                             surname=="DOLD JR"~"DOLD",
                             surname=="WIEZER"~"WEIZER",
                             surname=="HALE JR"~"HALE",
                             surname=="WALORSKI SWIHART"~"WALORSKI",
                             surname=="MAHONEY III"~"MAHONEY",
                             surname=="VAN HAAFTEN"~"VANHAAFTEN",
                             Cand_Name=="MCFARLANE, MICHAEL"~"MACFARLANE",
                             surname=="WEBBEDGINGTON"~"WEBB EDGINGTON",
                             surname=="TISEI"~"TSEI",
                             surname=="TEMPERLEY"~"TEMPERLY",
                             surname=="LA FERLA"~"LAFERLA",
                             surname=="BRIKHO"~"BIRKHO",
                             Cand_Name=="JACOBSEN, KAREN E"~"JACOBSON",
                             surname=="WALHGREN"~"WAHLGREN",
                             surname=="DILAURO"~"DELAURO",
                             surname=="INNARELLI JR"~"INNARELLI",
                             surname=="KELLY JR"~"KELLY",
                             surname=="GRISHAN"~"LUJAN GRISHAM",
                             surname=="CAVANAGH"~"CAVANAUGH",
                             surname=="DI CARLO"~"DICARLO",
                             surname=="BREEN"~"FOCKE BREEN",
                             surname=="COPE"~"CHOPE",
                             surname=="GREEN RONALD NEAL"~"GREEN",
                             surname=="DAYLIN"~"LEACH",
                             surname=="MARGOLIESMEZVINSKY"~"MARGOLIES",
                             surname=="WROBLESKIMULLIS"~"MULLIS",
                             surname=="LEINWEBEER"~"LEINWEBER",
                             surname=="STRADERDEAN"~"DEAN",
                             Cand_Name=="HALL, JERRY RAY"~"RAY HALL",
                             surname=="HATHCOX"~"HATCHCOX",
                             surname=="SEIBERT"~"SIEBERT",
                             surname=="PUENTEBRADSHAW"~"PUENTE BRADSHAW",
                             Cand_Name=="HERRERA, JAIME"~"HERRERA BEUTLER",
                             T~surname),
         # Manual district corrections for specific candidates; unlisted rows keep
         # the district built above.
         # NOTE(review): the rules conditioned on year==2012 look unreachable here
         # for the same reason as above -- confirm.
         district = case_when(year==2012&Cand_Name=="DEWITT, REBECCA"~"AZ07",
                              year==2012&Cand_Name=="SANCHEZ, LINDA"~"CA38",
                              Cand_Name=="LEVENE, ALLAN"&substr(Cand_Id, 3, 6)=="GA11"~"GA11",
                              Cand_Name=="LEVENE, ALLAN"&substr(Cand_Id, 3, 6)=="HI01"~"HI01",
                              Cand_Name=="STUTZMAN, MARLIN A"~"IN03",
                              Cand_Name=="WALLACE, DAVID DRAIN II"~"MD08",
                              Cand_Name=="HUNTER, STEPHEN D"~"MO07",
                              district=="MSNA"~"MS03",
                              surname=="GRIMM"&year==2012~"NY11",
                              Cand_Name=="MURPHY, MARK"&year==2012~"NY11",
                              Cand_Name=="BAUER, ANDRE"~"SC07",
                              Cand_Name=="WONNACOTT, BRIAN"~"UT03",
                              Cand_Name=="ROOP, BRANDON E"~"VA07",
                              Cand_Name=="MARSHALL, ROBERT G"~"VA10",
                              Cand_Name=="ANDERS, GREG"~"WA02",
                              surname=="RAIHALA"~"WI08",
                              T~district)) %>%
  # same column set as the select() above (no columns were added in between)
  select(year, Cand_Name, surname, Cand_Id, district, Cand_Party_Affiliation, Cand_Incumbent_Challenger_Open_Seat,
         Total_Receipt, Total_Disbursement, Individual_Contribution, Party_Committee_Contribution, Total_Contribution)

# Wide 2020 Daily Kos sheet: one row per district code, holding the winner
# string plus party / race / gender / last name for up to three candidate
# slots. Column names are lower-cased first; the `x.N` names are presumably
# auto-generated by read.csv for unnamed header cells -- verify against the
# raw file if the sheet layout changes.
dat_kos20 <- read.csv("data/raw/2020_dailykos_candidaterace.csv") %>%
  rename_with(tolower) %>%
  select(winner = x2020.winners,
         district = code,
         party1 = party,
         race1 = race...ethnicity,
         gender1 = gender,
         lname1 = x.4,
         race2 = race...ethnicity.1,
         lname2 = x.6,
         party2 = party.1,
         race3 = race...ethnicity.2,
         lname3 = x.8,
         party3 = party.2,
         gender2 = gender.1,
         gender3 = gender.2)

# Split the wide 2020 sheet into one table per candidate slot, all sharing the
# same column names so they can be stacked afterwards.
# `win` is 1 when the candidate's last name occurs inside the district's
# winner string. rowwise() is needed because grepl() accepts only a single
# pattern; fixed = TRUE makes the last name match as literal text rather than
# being interpreted as a regular expression.
cand1s <- dat_kos20 %>%
  select(winner, district,
         primcand_party = party1, race = race1,
         lnamecand = lname1, gender = gender1) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

cand2s <- dat_kos20 %>%
  select(winner, district,
         primcand_party = party2, race = race2,
         lnamecand = lname2, gender = gender2) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

cand3s <- dat_kos20 %>%
  select(winner, district,
         primcand_party = party3, race = race3,
         lnamecand = lname3, gender = gender3) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

# Stack the three candidate slots into one long 2020 table and normalise it:
# lower-case surname with hyphens turned into spaces, and race collapsed to
# asian / black / white / hispanic / other (NA when the source cell is empty).
# Every function used here is vectorised, so the inherited row-wise grouping
# is dropped up front instead of evaluating the mutate one row at a time.
dat_kos20 <- rbind(cand1s, cand2s, cand3s) %>%
  ungroup() %>%
  mutate(year = 2020,
         surname = gsub("-", " ", tolower(lnamecand)),
         # order matters: the first matching pattern wins
         race = case_when(grepl("Asian|Aisan|Pacific Islander", race) ~ "asian",
                          grepl("Black", race) ~ "black",
                          grepl("White|Iranian", race) ~ "white",
                          grepl("Hispanic", race) ~ "hispanic",
                          race != "" ~ "other",
                          TRUE ~ NA_character_)) %>%
  select(district, race, surname, year, gender, win) %>%
  # drop empty candidate slots
  filter(district != "" & surname != "")

# Keep the 2020 rows whose race could be classified, then append the rows
# whose race was filled in by hand.
not_missing_race <- filter(dat_kos20, !is.na(race))
dat_kos20_missingrace <- select(
  read.csv("data/raw/2020 primary candidates missing race.csv"),
  -X
) # hand-coded missing values
dat_kos20 <- mutate(rbind(not_missing_race, dat_kos20_missingrace),
                    year = 2020)

# Wide 2022 Daily Kos sheet, same layout as the 2020 one: winner string,
# district code, and party / race / gender / last name for up to three
# candidate slots. The `x.N` names are presumably auto-generated by read.csv
# for unnamed header cells -- verify against the raw file if the sheet changes.
dat_kos22 <- read.csv("data/raw/2022_dailykos_candidaterace.csv") %>%
  rename_with(tolower) %>%
  select(winner = x2022.winners,
         district = code,
         party1 = party,
         race1 = race...ethnicity,
         gender1 = gender,
         lname1 = x.17,
         race2 = race...ethnicity.1,
         lname2 = x.19,
         party2 = party.1,
         race3 = race...ethnicity.2,
         lname3 = x.21,
         party3 = party.2,
         gender2 = gender.1,
         gender3 = gender.2)

# Split the wide 2022 sheet into one table per candidate slot, all sharing the
# same column names so they can be stacked afterwards.
# `win` is 1 when the candidate's last name occurs inside the district's
# winner string. rowwise() is needed because grepl() accepts only a single
# pattern; fixed = TRUE makes the last name match as literal text rather than
# being interpreted as a regular expression.
cand1s <- dat_kos22 %>%
  select(winner, district,
         primcand_party = party1, race = race1,
         lnamecand = lname1, gender = gender1) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

cand2s <- dat_kos22 %>%
  select(winner, district,
         primcand_party = party2, race = race2,
         lnamecand = lname2, gender = gender2) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

cand3s <- dat_kos22 %>%
  select(winner, district,
         primcand_party = party3, race = race3,
         lnamecand = lname3, gender = gender3) %>%
  distinct() %>%
  rowwise() %>%
  mutate(win = as.numeric(grepl(lnamecand, winner, fixed = TRUE)))

# Stack the three candidate slots into one long 2022 table and normalise it:
# lower-case surname with hyphens turned into spaces, and race collapsed to
# asian / black / white / hispanic / other (NA when the source cell is empty).
# Every function used here is vectorised, so the inherited row-wise grouping
# is dropped up front instead of evaluating the mutate one row at a time.
dat_kos22 <- rbind(cand1s, cand2s, cand3s) %>%
  ungroup() %>%
  mutate(year = 2022,
         surname = gsub("-", " ", tolower(lnamecand)),
         # order matters: the first matching pattern wins
         race = case_when(grepl("Asian|Aisan|Pacific Islander", race) ~ "asian",
                          grepl("Black", race) ~ "black",
                          grepl("White|Iranian", race) ~ "white",
                          grepl("Hispanic", race) ~ "hispanic",
                          race != "" ~ "other",
                          TRUE ~ NA_character_)) %>%
  select(district, race, surname, year, gender, win) %>%
  # drop empty candidate slots
  filter(district != "" & surname != "")

# Keep the 2022 rows whose race could be classified, then append the rows
# whose race was filled in by hand.
not_missing_race <- filter(dat_kos22, !is.na(race))
dat_kos22_missingrace <- select(
  read.csv("data/raw/2022 primary candidates missing race.csv"),
  -X
) # hand-coded missing values
dat_kos22 <- mutate(rbind(not_missing_race, dat_kos22_missingrace),
                    year = 2022)

# Combine the 2020 and 2022 candidate-race tables and harmonise the join keys
# (district code and upper-case surname) used to merge them onto dat20.
race2022 <- full_join(dat_kos20, dat_kos22) %>%
  mutate(
    # At-large codes: rewrite the "-AL" suffix BEFORE stripping hyphens.
    # (Previously the hyphens were removed first, so this rule never matched.)
    district = gsub("-AL", "01", district, fixed = TRUE),
    district = gsub("-", "", district, fixed = TRUE),
    # At-large codes that arrive without a hyphen.
    district = gsub("AKAL", "AK01", district),
    district = gsub("DEAL", "DE01", district),
    district = gsub("MTAL", "MT01", district),
    district = gsub("VTAL", "VT01", district),
    district = gsub("NDAL", "ND01", district),
    district = gsub("SDAL", "SD01", district),
    district = gsub("WYAL", "WY01", district),
    surname = gsub("'", "", surname),
    # Drop the "jr." / "jr" suffix; matched literally so the "." cannot
    # swallow an arbitrary following character.
    surname = gsub("jr.", "", surname, fixed = TRUE),
    surname = gsub("jr", "", surname, fixed = TRUE),
    surname = gsub("Liegeois", "liegeois", surname),
    # Hand corrections; the result is upper-cased and trimmed, so the case of
    # the replacement values does not matter.
    surname = trimws(toupper(case_when(
      surname == "sánchez" ~ "sanchez",
      surname == "luján" ~ "lujan",
      surname == "velázquez" ~ "velazquez",
      surname == "oconner" ~ "o connor",
      surname == "stotts pearson" ~ "pearson",
      surname == "gonzález" ~ "gonzalez",
      surname == "aquilera" ~ "aguilera",
      surname == "beutler" ~ "herrera beutler",
      surname == "hayes hinson" ~ "hinson",
      surname == "lee lake" ~ "lake",
      surname == "brown pelzer" ~ "pelzer",
      surname == "wayne lorch" ~ "lorch",
      surname == "falwell" ~ "fawell",
      surname == "pezullo" ~ "pezzullo",
      surname == "van drew" ~ "vandrew",
      surname == "yates glisson" ~ "glisson",
      surname == "ortiz jones" ~ "jones",
      surname == "anatatmula" ~ "ANANTATMULA",
      surname == "aranoff" ~ "ARONOFF",
      surname == "cruz ezammudeen" ~ "CRUZ",
      surname == "devine iii" ~ "DEVINE",
      surname == "osmeña" ~ "OSMENA",
      surname == "dueñas" ~ "duenas",
      TRUE ~ surname
    ))),
    # Strip the remaining accented capitals.
    surname = gsub("É", "E", surname),
    surname = gsub("Ñ", "N", surname),
    surname = gsub("Ó", "O", surname),
    surname = gsub("Á", "A", surname),
    surname = gsub("Í", "I", surname)
  )

# Hand-harmonise surnames and district codes in dat20 so they line up with the
# keys in race2022 before the join. case_when() returns the first matching
# branch, so the order of the rules below matters.
dat20 <- dat20 %>%
  mutate(
    surname = case_when(
      surname == "ARMENDARIZJACKSON" ~ "ARMENDARIZ JACKSON",
      surname == "ARNOLDJONES" ~ "ARNOLD JONES",
      surname == "ASAMOACAESAR" ~ "ASAMOA CAESER",
      Cand_Name == "KENNEDY, RONDA" ~ "BALDWIN KENNEDY",
      surname == "ROCHESTER" ~ "BLUNT ROCHESTER",
      surname == "CAMPANAJJAR" ~ "CAMPA NAJJAR",
      surname == "ROYBALALLARD" ~ "ROYBAL ALLARD",
      surname == "MUCARSELPOWELL" ~ "MUCARSEL POWELL",
      surname == "VAN DREW" ~ "VANDREW",
      surname == "OCASIOCORTEZ" ~ "OCASIO CORTEZ",
      surname == "DIAZBALART" ~ "DIAZ BALART",
      surname == "DEVOLDERSANTOS" ~ "DEVOLDER SANTOS",
      surname == "HARVEYHALL" ~ "HARVEY HALL",
      surname == "ARENHOLZ" ~ "HINSON",
      Cand_Name == "LEE, SHEILA JACKSON" ~ "JACKSON LEE",
      surname == "JEANPIERRE" ~ "JEAN PIERRE",
      surname == "JOHNSONGREEN" ~ "JOHNSON GREEN",
      Cand_Name == "LEE LAKE, JEANNINE L" ~ "LAKE",
      Cand_Name %in% c("TILLMAN, JIMMY LEE", "TILLMAN II, JIMMY LEE") ~ "LEE TILLMAN II",
      surname == "LEGER" ~ "LEGER FERNANDEZ",
      surname == "PAULINA LUNA" ~ "LUNA",
      surname == "MCLEODSKINNER" ~ "MCLEOD SKINNER",
      # Name must contain both tokens. The previous pattern "CATHY&MCMORRIS"
      # searched for a literal "&" and therefore never matched.
      grepl("CATHY", Cand_Name) & grepl("MCMORRIS", Cand_Name) ~ "MCMORRIS RODGERS",
      surname == "MILLERMEEKS" ~ "MILLER MEEKS",
      surname == "MUKHERJEE" ~ "RAMIREZ MUKHERJEE",
      surname == "SANTIAGOCANO" ~ "SANTIAGO CANO",
      surname == "TIMMONSGOODSON" ~ "TIMMONS GOODSON",
      surname == "LOOR" ~ "VON LOOR",
      surname == "WARNERSTANTON" ~ "WARNER STANTON",
      surname == "HAYS" ~ "MALLETT HAYS",
      surname == "GOENAGATORRES" ~ "TORRES",
      surname == "CHERFILUSMCCORMICK" ~ "CHERFILUS MCCORMICK",
      surname == "BALDERRAMOS ROBINSON" ~ "ROBINSON",
      surname == "DAZAFERNANDEZ" ~ "DAZA",
      surname == "RADAKERSHEAFER" ~ "RADAKER SHEAFER",
      surname == "COLEMAN" & district == "NJ12" ~ "WATSON COLEMAN",
      surname == "BEACHFERRARA" ~ "BEACH FERRARA",
      surname == "RIPPELELTON" ~ "RIPPEL ELTON",
      surname == "HARRISTILL" ~ "HARRIS TILL",
      surname == "TREVNO" ~ "NUNO",
      surname == "GLUESENKAMP PEREZ" ~ "PEREZ",
      surname == "GREY BULL" ~ "GREYBULL",
      surname == "SOSSAPAQUETTE" ~ "SOSSA PAQUETTE",
      surname == "NICK" ~ "LALOTA",
      # NOTE(review): if this candidate's surname is "DEVOLDERSANTOS", the
      # earlier DEVOLDERSANTOS rule wins and this branch is never reached --
      # confirm which spelling the join needs.
      Cand_Name == "DEVOLDER-SANTOS, GEORGE ANTHONY" ~ "SANTOS",
      surname == "FLISSER" ~ "LEVITT FLISSER",
      surname == "CHAVEZDEREMER" ~ "CHAVEZ DEREMER",
      surname == "DE LA CRUZ HERNANDEZ" ~ "DE LA CRUZ",
      surname == "MCMORRIS" ~ "MCMORRIS RODGERS",
      surname == "PUDLO" ~ "GIRL",
      TRUE ~ surname
    ),
    # District overrides; `surname` here is the corrected value from above.
    district = case_when(
      Cand_Name == "WILD, SUSAN" ~ "PA07",
      Cand_Name == "WATSON, EUGENE LACY" & year == 2022 ~ "WV01",
      surname == "SCHWEIKERT" & year == 2022 ~ "AZ01",
      surname == "POZZOLO" & year == 2022 ~ "AZ07",
      Cand_Name == "SMITH, AJA" & year == 2022 ~ "CA39",
      Cand_Name == "JONES, PAUL IRVING MR" & year == 2022 ~ "CA44",
      Cand_Name == "WALBERG, TIMOTHY L." & year == 2022 ~ "MI05",
      Cand_Name == "MCCLAIN, LISA" & year == 2022 ~ "MI09",
      TRUE ~ district
    )
  )
# Attach candidate race / gender / win from race2022 (joined on all shared
# column names) and record the match status of every row.
dat20 <- dat20 %>%
  left_join(race2022) %>%
  mutate(
    merge = case_when(
      is.na(Cand_Id) ~ "right only",
      is.na(race) ~ "left only",
      TRUE ~ "both"
    )
  )

# Propagate race across rows of the same candidate, derive the non-individual
# receipts, apply hand-coded race overrides, and rename/recode the columns to
# the names used in the rest of the analysis.
dat20 <- dat20 %>%
  # Fill race within district + surname first, then within the full FEC name.
  group_by(district, surname) %>%
  fill(race, .direction = "downup") %>%
  ungroup() %>%
  group_by(Cand_Name) %>%
  fill(race, .direction = "downup") %>%
  ungroup() %>%
  # Plain vectorised arithmetic: no grouping or rowwise() is needed here.
  mutate(
    pacsandothercomittees = Total_Receipt - Individual_Contribution,
    race = case_when(
      Cand_Name %in% c("MOONEY, ALEXANDER XAVIER", "SIRES, ALBIO",
                       "NAPOLITANO, GRACE", "ROYBAL-ALLARD, LUCILLE",
                       "VELA, FILEMON MR.", "SHALALA, DONNA",
                       "NUNES, DEVIN G") ~ "hispanic",
      Cand_Name %in% c("TLAIB, RASHIDA", "ESHOO, ANNA") ~ "asian",
      Cand_Name %in% c("MULLIN, MARKWAYNE", "COLE, TOM") ~ "native american",
      TRUE ~ race
    )
  ) %>%
  select(receipts = Total_Receipt,
         cycle = year,
         indivs = Individual_Contribution,
         pacsandothercomittees,
         candidate = Cand_Name,
         party = Cand_Party_Affiliation,
         gender,
         status = Cand_Incumbent_Challenger_Open_Seat,
         statedis = district,
         win,
         race) %>%
  mutate(
    # Race dummies; a missing race counts as 0.
    black = as.numeric(race %in% "black"),
    white = as.numeric(race %in% "white"),
    party = case_when(party == "DEM" ~ "Democratic",
                      party == "REP" ~ "Republican",
                      TRUE ~ NA_character_),
    status = str_to_title(status),
    # Drop the zero padding of single-digit districts ("CA01" -> "CA1").
    # NOTE(review): gsub() removes every "0" in the code, so a value such as
    # "XX00" would collapse to "XX" -- confirm no such codes reach this step.
    statedis = case_when(substr(statedis, 3, 3) == "0" ~ gsub("0", "", statedis),
                         TRUE ~ statedis)
  )

# add in covariates

# District codes (state abbreviation + unpadded district number, the same
# format as `statedis`) that set the `safep` flag to 1 for the 2020 cycle.
# NOTE(review): the source of this list is not recorded in the script --
# presumably a published seat-rating list; confirm.
safe20 <- c("AZ2", "CA7", "CA10", "CA45", "CO6", "FL7", "FL13", "FL27", "IL6",
            "IL14", "IL17", "KS3", "ME2", "MI8", "MI11", "MN2", "MN3", "MN5",
            "NE4", "NH2", "NJ3", "NJ5", "NJ11", "NY18", "NY19", "NC1", "OH13",
            "PA7", "PA8", "PA17", "TX32", "VA10", "WA8", "WI3",
            "CA1", "CA4", "CA8", "CA22", "CA42", "IA4", "KS2", "KY6", "MI7",
            "MN8", "NY21", "NY27", "NC9", "OH12", "PA16", "SC2", "TX1", "TX2",
            "TX27", "TX31", "VA1")

# Same as above, for the 2022 cycle.
safe22 <- c("CA6", "CA21", "CA25", "CT2", "CT3", "FL9", "FL14", "FL22", "FL23",
            "GA2", "IL8", "IL11", "MD2", "MD3", "MA9", "MI11", "MN3", "NJ1", 
            "NJ5", "NJ9", "NJ11", "NM1", "NM3", "NY20", "NY26", "NC6", "NC14",
            "PA6", "WA6", "WA10",
            "CA23", "CA41", "CO3", "CO5", "FL2", "FL16", "FL28", "GA12", "MD1",
            "MI4", "MN1", "MN8", "MO2", "NE1", "NV2", "NJ2", "NC7", "NC9", "NC11",
            "OH7", "OH10", "OH15", "PA1", "PA10", "SC1", "TX23", "VA1", "VA5", "WA5", "WI1")


# Candidate-level 0/1 covariates:
#   safep      - district is in the cycle's safe20 / safe22 list
#                (NA for any cycle other than 2020 or 2022)
#   comchair   - hand-coded flag for one candidate in the 2020 cycle
#   leadership - hand-coded flag for the candidates listed below, by cycle
#   south      - district code contains one of the listed state abbreviations
# `%in%` never returns NA, so missing values fall to 0.
dat20 <- dat20 %>%
  mutate(
    safep = case_when(cycle == 2020 ~ as.numeric(statedis %in% safe20),
                      cycle == 2022 ~ as.numeric(statedis %in% safe22)),
    comchair = as.numeric(cycle %in% 2020 & candidate %in% "SCOTT, ROBERT C."),
    leadership = as.numeric(
      (cycle %in% 2020 &
         candidate %in% c("LEE, BARBARA",
                          "WASSERMAN SCHULTZ, DEBBIE",
                          "FERGUSON, ANDERSON DREW IV")) |
        (cycle %in% 2022 &
           candidate %in% c("JEFFRIES, HAKEEM",
                            "CLARK, KATHERINE",
                            "AGUILAR, PETE",
                            "CLYBURN, JAMES E",
                            "LIEU, TED"))
    ),
    south = as.numeric(grepl("AL|AR|FL|GA|LA|MS|NC|SC|TN|TX|VA", statedis))
  )

# Largest candidate id already present in `dat`, ignoring missing ids.
max_id <- max(dat$id, na.rm = TRUE)

# Download 1-year ACS estimates for every congressional district and reshape
# them into one row per district with the covariates used in the models.
#
# acs_year:   ACS survey year passed to get_acs().
# cycle_year: election cycle written to the `cycle` column.
#
# Returns a plain (ungrouped) tibble with columns cycle, educpct,
# whitepct_nohisp, HHincome and statedis.
acs_district_covariates <- function(acs_year, cycle_year) {
  get_acs(geography = "congressional district",
          variables = c(educpop = "B06009_001",
                        educ_bach = "B06009_005",
                        educ_grad = "B06009_006",
                        HHincome = "B22008_001",
                        whitepop = "B01001H_001",
                        totpop = "B01001_001"),
          year = acs_year,
          survey = "acs1") %>%
    filter(!grepl("Puerto Rico", NAME)) %>%
    # Reduce NAME to "<district number> , <state>" so the number and the state
    # can be parsed out; at-large and delegate districts are numbered 1.
    mutate(NAME = gsub("Congressional District \\(", "", NAME),
           NAME = gsub("\\(.*\\)", "", NAME),
           NAME = gsub("at Large", "1", NAME),
           NAME = gsub("Delegate District", "1", NAME),
           distcd = parse_number(NAME),
           state = gsub(".*, ", "", NAME),
           # names not found in state.name yield an NA abbreviation
           stcd = state.abb[match(state, state.name)],
           statedis = paste0(stcd, distcd)) %>%
    select(-moe) %>%
    # one column per ACS variable
    pivot_wider(names_from = variable, values_from = estimate) %>%
    mutate(cycle = cycle_year,
           # educpct is a share (0-1); whitepct_nohisp is a percentage (0-100)
           educpct = (educ_bach + educ_grad) / educpop,
           whitepct_nohisp = (whitepop / totpop) * 100) %>%
    select(cycle, educpct, whitepct_nohisp, HHincome, statedis)
}

demos20 <- acs_district_covariates(acs_year = 2019, cycle_year = 2020)

demos22 <- acs_district_covariates(acs_year = 2021, cycle_year = 2022)

# Stack the two cycles of district covariates, then attach them to the
# candidate data (both joins use all shared column names as keys).
demos <- demos20 %>% full_join(demos22)

dat20 <- dat20 %>% left_join(demos)

dat <- full_join(dat, dat20) %>%
  mutate(candidate = case_when(grepl("ADERHOLT, ROBERT", candidate)~"ADERHOLT, ROBERT",
                               grepl("AMODEI, MARK EUGENE", candidate)~"AMODEI, MARK EUGENE",
                               grepl("BARR, GARLAND", candidate)~"BARR, GARLAND ANDY",
                               grepl("BEYER, DONALD STERNOFF", candidate)~"BEYER, DONALD STERNOFF",
                               candidate=="BONAMICI, SUZANNE MS."~"BONAMICI, SUZANNE",
                               candidate=="BUCSHON, LARRY D."~"BUCSHON, LARRY D",
                               grepl("BUDD, THEODORE P", candidate)~"BUDD, THEODORE P",
                               grepl("CALVERT, KEN", candidate)~"CALVERT, KEN",
                               grepl("CARTWRIGHT, MATT", candidate)~"CARTWRIGHT, MATT",
                               candidate=="CARTER, JOHN R. REP."~"CARTER, JOHN R.",
                               grepl("CLARKE, YVETTE", candidate)~"CLARKE, YVETTE",
                               candidate=="CLYBURN, JAMES E."~"CLYBURN, JAMES E",
                               grepl("CONNOLLY, GER", candidate)~"CONNOLLY, GERALD",
                               candidate=="DEFAZIO, PETER A."~"DEFAZIO, PETER A",
                               grepl("DESJARLAIS, SCOTT", candidate)~"DESJARLAIS, SCOTT",
                               candidate=="BACON, DONALD J"~"BACON, DONALD",
                               candidate=="BIGGS, ANDY MR."~"BIGGS, ANDY",
                               candidate=="BILIRAKIS, GUS M"~"BILIRAKIS, GUS MICHAEL",
                               candidate=="BISHOP, SANFORD D JR."~"BISHOP, SANFORD D",
                               candidate=="BISHOP, SANFORD D JR"~"BISHOP, SANFORD D",
                               candidate=="BOST, MICHAEL J"~"BOST, MICHAEL",
                               candidate=="BROWN, ANTHONY GREGORY"~"BROWN, ANTHONY",
                               candidate=="BUTTERFIELD, G. K."~"BUTTERFIELD, G K",
                               grepl("CASE, EDWARD", candidate)~"CASE, EDWARD",
                               candidate=="CLEAVER, EMANUEL II"~"CLEAVER II, EMANUEL",
                               candidate=="DAVIS, DANNY K. MR."~"DAVIS, DANNY K",
                               candidate=="DOGGETT, LLOYD REP."~"DOGGETT, LLOYD",
                               candidate=="DOYLE, MICHAEL"~"DOYLE, MIKE",
                               candidate=="DUNCAN, JEFFREY D MR."~"DUNCAN, JEFF",
                               candidate=="EDWARDS, DONNA FERN"~"EDWARDS, DONNA",
                               candidate=="ELLIOTT, JOYCE ANN SENATOR"~"ELLIOTT, JOYCE ANN",
                               candidate=="EMMER, THOMAS EARL JR."~"EMMER, THOMAS EARL JR",
                               grepl("ENGEL, ELIOT", candidate)~"ENGEL, ELIOT",
                               grepl("FORTENBERRY, JEFF", candidate)~"FORTENBERRY, JEFF",
                               candidate=="FOXX, VIRGINIA ANN"~"FOXX, VIRGINIA",
                               grepl("GIBBS", candidate)~"GIBBS, ROBERT",
                               candidate=="GOHMERT, LOUIS B. MR. JR."~"GOHMERT, LOUIE",
                               grepl("GOSAR, PAUL", candidate)~"GOSAR, PAUL",
                               candidate=="GRAVES, SAMUEL B JR 'SAM'"~"GRAVES, SAMUEL B \"SAM\"",
                               candidate=="GROTHMAN, GLENN S."~"GROTHMAN, GLENN S",
                               grepl("GUTHRIE, S", candidate)~"GUTHRIE, S. BRETT",
                               candidate=="HAMLIN, ROBYN L"~"HAMLIN, ROBYN LYNN",
                               grepl("HASTINGS, ALCEE", candidate)~"HASTINGS, ALCEE",
                               candidate=="HICE, JODY B MR"~"HICE, JODY",
                               candidate=="HIGGINS, CLAY CAPTAIN"~"HIGGINS, CAPTAIN CLAY",
                               candidate=="HOYER, STENY HAMILTON"~"HOYER, STENY",
                               candidate=="JOHNSON, HENRY C 'HANK' JR"~"JOHNSON, HENRY C 'HANK'",
                               candidate=="JOHNSON, HENRY C. 'HANK"~"JOHNSON, HENRY C 'HANK'",
                               grepl("KAPTUR, MARCY", candidate)~"KAPTUR, MARCY",
                               candidate=="KELLY, ROBIN L."~"KELLY, ROBIN",
                               grepl("KELLY, JOHN TRENT", candidate)~"KELLY, JOHN TRENT",
                               candidate=="KILDEE, DANIEL T."~"KILDEE, DANIEL T",
                               candidate=="KIND, RONALD JAMES"~"KIND, RON",
                               grepl("LANGEVIN, JAMES R", candidate)~"LANGEVIN, JAMES R",
                               candidate=="LARSEN, RICHARD RAY"~"LARSEN, RICK",
                               candidate=="LAWRENCE, BRENDA LUIENAR"~"LAWRENCE, BRENDA LULENAR",
                               grepl("LAWSON, ALFRED", candidate)~"LAWSON, ALFRED JR",
                               grepl("LIPINSKI, DANIEL", candidate)~"LIPINSKI, DANIEL",
                               candidate=="LONG, BILLY MR."~"LONG, BILLY",
                               grepl("LYNCH, STEPHEN", candidate)~"LYNCH, STEPHEN",
                               grepl("MALONEY, CAROLYN", candidate)~"MALONEY, CAROLYN",
                               candidate=="MCEACHIN, ASTON DONALD MR."~"MCEACHIN, ASTON DONALD",
                               grepl("MCGOVERN, JAMES", candidate)~"MCGOVERN, JIM",
                               candidate=="MCHENRY, PATRICK TIMOTHY"~"MCHENRY, PATRICK",
                               grepl("MCKINLEY, DAVID B", candidate)~"MCKINLEY, DAVID B",
                               candidate=="MCNERNEY, GERALD MARK"~"MCNERNEY, JERRY",
                               candidate=="MOOLENAAR, JOHN MR."~"MOOLENAAR, JOHN",
                               candidate=="MOORE, GWEN S"~"MOORE, GWENDOLYNNE",
                               candidate=="MUELLER, STEVE C"~"MUELLER, STEVE",
                               candidate=="NADLER, JERROLD L. MR."~"NADLER, JERROLD L.",
                               grepl("NEWHOUSE, DAN", candidate)~"NEWHOUSE, DAN",
                               candidate=="OLSON, PETER G."~"OLSON, PETER G",
                               grepl("PASCRELL", candidate)~"PASCRELL, WILLIAM J",
                               candidate=="PAYNE, DONALD M., JR."~"PAYNE, DONALD M",
                               grepl("PRICE, DAVID", candidate)~"PRICE, DAVID",
                               grepl("PETERSON, COLLIN", candidate)~"PETERSON, COLLIN",
                               grepl("RATCLIFFE, JOHN L", candidate)~"RATCLIFFE, JOHN L",
                               grepl("ROUZER, DAVID CHESTON", candidate)~"ROUZER, DAVID CHESTON",
                               grepl("REED, THOMAS W", candidate)~"REED, THOMAS W",
                               candidate=="RICHMOND, CEDRIC L."~"RICHMOND, CEDRIC L",
                               candidate=="ROCHESTER, LISA BLUNT"~"BLUNT ROCHESTER, LISA",
                               candidate=="ROGERS, LAKESHA D"~"ROGERS, LAKESHA",
                               candidate=="ROGERS, MICHAEL DENNIS"~"ROGERS, MICHAEL",
                               grepl("RUPPERSBERGER", candidate)~"RUPPERSBERGER, DUTCH",
                               candidate=="RUSH, BOBBY LEE"~"RUSH, BOBBY L",
                               grepl("RYAN, TIMOTHY", candidate)~"RYAN, TIMOTHY",
                               grepl("SCALISE, STEVE", candidate)~"SCALISE, STEVE",
                               grepl("SCOTT, ROBERT", candidate)~"SCOTT, ROBERT",
                               candidate=="SEBRING, DANIEL PAUL"~"SEBRING, DANIEL P",
                               candidate=="SEIBERT, SEAN ERIC LEE"~"SEIBERT, SEAN",
                               candidate=="SEWELL, TERRYCINA ANDREA"~"SEWELL, TERRI A.",
                               candidate=="SIMPSON, MICHAEL K"~"SIMPSON, MICHAEL",
                               candidate=="SMITH, CHRISTOPHER H."~"SMITH, CHRISTOPHER H",
                               candidate=="SMITH, D. ADAM"~"SMITH, D ADAM",
                               candidate=="SMUCKER, LLOYD K."~"SMUCKER, LLOYD K",
                               candidate=="STOCKHAM, CHARLES (CASPER) WESLEY"~"STOCKHAM, CASPER WESLEY",
                               candidate=="STOKHAM, CHARLES WESLEY (CASPER)"~"STOCKHAM, CASPER WESLEY",
                               grepl("THOMPSON, GLENN", candidate)~"THOMPSON, GLENN",
                               grepl("TIPTON, SCOTT", candidate)~"TIPTON, SCOTT",
                               grepl("TURNER, MICHAEL", candidate)~"TURNER, MICHAEL",
                               candidate=="VEASEY, MARC ALLISON MR."~"VEASEY, MARC ALLISON",
                               candidate=="WAGNER, ANN L."~"WAGNER, ANN L",
                               grepl("WALBERG, TIMOTHY", candidate)~"WALBERG, TIMOTHY",
                               candidate=="WALORSKI (SWIHART), JACKIE"~"WALORSKI SWIHART, JACKIE",
                               candidate=="WATT, MELVIN L"~"WATT, MEL",
                               candidate=="WEST, ALLEN B MR."~"WEST, ALLEN B",
                               candidate=="WESTERMAN, BRUCE MR."~"WESTERMAN, BRUCE",
                               candidate=="WILSON, FREDERICA S."~"WILSON, FREDERICA S",
                               candidate=="WITTMAN, ROBERT J MR."~"WITTMAN, ROBERT J.",
                               candidate=="YARMUTH, JOHN A MR"~"YARMUTH, JOHN A",
                               candidate=="MULLIN, MARKWAYNE MR."~"MULLIN, MARKWAYNE",
                               candidate=="CORREA, JOSE LUIS (LOU) MR."~"CORREA, JOSE",
                               grepl("ESHOO", candidate)~"ESHOO, ANNA",
                               candidate=="VELA, FILEMON MR."~"VELA, FILEMON",
                               candidate=="VARGAS, JUAN C."~"VARGAS, JUAN CARLOS",
                               grepl("RUIZ, RAUL", candidate)~"RUIZ, RAUL",
                               grepl("CASTRO", candidate)~"CASTRO, JOAQUIN",
                               grepl("GONZALEZ, VICENTE", candidate)~"GONZALEZ, VICENTE",
                               T~candidate)) %>%
  group_by(candidate) %>%
  fill(id, gender) %>% # carry forward id variable for candidates who are running again in 2020
  mutate(candnum = cur_group_id(),
         id = case_when(!is.na(id)~id,
                        T~max_id + candnum),
         seniority = ifelse(status!="Incumbent", 0, seniority),
         gender = case_when(candidate=="MCDOWELL, JAN"~"Woman",
                            candidate=="WATKINS, STEVE"~"Man",
                            candidate=="MCCREADY, DANIEL"~"Man",
                            candidate=="GAITHER, KEVIN"~"Man",
                            candidate=="WILLIAMS, WILLIAM HENRY"~"Man",
                            candidate=="RING, LISA M"~"Woman",
                            candidate=="ENGEL, ELIOT L. REP."~"Man",
                            candidate=="ROBINSON, EMILY"~"Woman",
                            candidate=="RIGGLEMAN, DENVER LEE MR. III"~"Man",
                            candidate=="BAUGH, SCOTT"~"Man",
                            candidate=="CLARK, JOHN"~"Man",
                            candidate=="PHILLIPS, GEORGE KARL MR"~"Man",
                            candidate=="BARKLEY, MICHAEL JAMES"~"Man",
                            candidate=="PETERS, SAMUEL JAMES MR"~"Man",
                            candidate=="HILL, DONALD"~"Man",
                            candidate=="SWANK, KEITH ROLAND"~"Man",
                            candidate=="LYRAS, LOUIS GEORGE"~"Man",
                            candidate=="BROUN, PAUL"~"Man",
                            candidate=="GONSALVES, MARK"~"Man",
                            candidate=="GRAYSON, DENA MD, PHD"~"Woman",
                            candidate=="JONES, LAURA"~"Woman",
                            candidate=="LETLOW, LUKE JOSHUA"~"Man",
                            candidate=="WESTON, JOHN KAEL MR"~"Man",
                            candidate=="CONOLE, FRANCIS"~"Man",
                            candidate=="ROUSE, ERIC S"~"Man",
                            candidate=="CHERFILUS-MCCORMICK, SHEILA"~"Woman",
                            candidate=="OLIVO, CHRISTINE ALEXANDRIA"~"Woman",
                            candidate=="FLEMING, BRIDGET"~"Woman",
                            candidate=="FOX, DR. RICHARD B"~"Man",
                            candidate=="HOLLOWAY, LASHONDA J"~"Woman",
                            candidate=="CUMMINGS, MAYA ROCKEYMOORE"~"Woman",
                            candidate=="THORPE, DEVIN D"~"Man",
                            candidate=="ASHFORD, ANN"~"Woman",
                            candidate=="PUDLO, G MAEBE A. GIRL"~"Woman",
                            candidate=="YOUNG, GEOFFREY M"~"Man",
                            candidate=="EDWARDS, DONNA F"~"Woman",
                            candidate=="ROSS, DENNIS"~"Man",
                            candidate=="FLINN, GEORGE S JR"~"Man",
                            candidate=="WILLIAMS, DAVID"~"Man",
                            candidate=="ARRINGTON, KATIE"~"Woman",
                            candidate=="MARTIN, BRANDON MR."~"Man",
                            candidate=="YANEZ, HENRY"~"Man",
                            candidate=="WALLACE, DAVID DRAIN II"~"Man",
                            candidate=="CONYERS, JOHN"~"Man",
                            candidate=="BROUN, PAUL"~"Man",
                            candidate=="O'CONNOR, DANIEL JAY JR."~"Man",
                            candidate=="WRIGHT, SUSAN"~"Woman",
                            T~gender),
         gender = case_when(gender=="Man"~0,
                            gender=="Woman"~1,
                            T~NA_real_),
         open = case_when(status=="Open"~1, T~0),
         incumbent = case_when(status=="Incumbent"~1, T~0),
         status = case_when(candidate=="GRIGGS, JOYCE MARIE"~"Challenger", T~status),
         experience = case_when(incumbent==1~"Held Elected Office", T~experience)) %>%
  rowwise() %>% 
  mutate(majminority = as.numeric(whitepct_nohisp < .5))

# Fill in House seniority for the 2020 and 2022 cycles.
#   1. Hand-code seniority for specific 2020/2022 candidates.
#   2. Record each candidate's 2018 seniority (0 for non-incumbents who won in
#      2018), carry it down that candidate's rows, and use it + 2 wherever
#      2020 seniority is still missing.
#   3. Repeat step 2 from 2020 into 2022.
# Non-incumbents always end up with seniority 0.
# NOTE(review): fill() follows the current row order within each candidate;
# this presumably assumes rows are already in ascending cycle order -- confirm.
dat <- dat %>%
  # `dat` is still rowwise here (the preceding pipeline ends in rowwise()
  # without ungroup()). Everything below is elementwise, so drop the rowwise
  # grouping rather than evaluating each expression once per row.
  ungroup() %>%
  mutate(
    seniority = case_when(
      cycle == 2020 & candidate %in% c(
        "BISHOP, JAMES DANIEL",
        "MURPHY, GREGORY FRANCIS DR",
        "TRONE, DAVID",
        "JOYCE, JOHN",
        "WILD, SUSAN",
        "STEUBE, W. GREG",
        "MCBATH, LUCIA KAY MS.",
        "TRAHAN, LORI",
        "TORRES SMALL, XOCHITL"
      ) ~ 2,
      cycle == 2020 & candidate %in% c(
        "TIFFANY, TOM",
        "JACOBS, CHRISTOPHER L.",
        "MFUME, KWEISI",
        "KELLER, FRED"
      ) ~ 1,
      cycle == 2020 & candidate %in% c("COSTA, JIM MR.") ~ 8,
      cycle == 2020 & candidate %in% c("PANETTA, JAMES VARNI") ~ 4,
      cycle == 2022 & candidate == "ISSA, DARRELL" ~ 2,
      TRUE ~ seniority
    ),
    # 2018 baseline: incumbents keep their seniority, newly elected members
    # start at 0, everyone else stays NA so fill() can carry a value forward.
    seniority18 = ifelse(cycle == 2018 & incumbent == 1, seniority, NA),
    seniority18 = ifelse(cycle == 2018 & incumbent == 0 & win == 1, 0, seniority18)
  ) %>%
  group_by(candidate) %>%
  fill(seniority18) %>%
  mutate(
    seniority = case_when(
      status != "Incumbent" ~ 0,
      is.na(seniority) & cycle == 2020 ~ seniority18 + 2,
      TRUE ~ seniority
    )
  ) %>%
  mutate(
    # Same baseline logic, one cycle later.
    seniority20 = ifelse(cycle == 2020 & incumbent == 1, seniority, NA),
    seniority20 = ifelse(cycle == 2020 & incumbent == 0 & win == 1, 0, seniority20)
  ) %>%
  fill(seniority20) %>%
  mutate(
    seniority = case_when(
      status != "Incumbent" ~ 0,
      is.na(seniority) & cycle == 2022 ~ seniority20 + 2,
      TRUE ~ seniority
    ),
    n = n() # number of rows per candidate
  ) %>%
  ungroup()

# All incumbents were coded in the original dataset as not having held elected office, so that is corrected above. The remaining 356 candidates who are missing experience have to be hand-coded.


# code experience for missing people
# Hand-coded candidates with no prior elected office. Used below to set
# `experience` to "None" on cycle-2020 rows.
nones20 <- c(
  "ACKERMAN, PATRICIA GERALDENE MS.",
  "ADKINS, AMANDA",
  "ALBRO, CATHERINE",
  "ALMONORD, VALBRUN",
  "ANDREWS, ALISCIA",
  "ANTHONY, CHARLES",
  "ASAMOA-CAESAR, KOJO",
  "ASHFORD, ANN",
  "AVERHART, JAMES",
  "BALTER, DANA",
  "BANYAI, CINDY LYN",
  "BARNETT, JOSH",
  "BARNETT, KALI",
  "BARNETTE, KATHY",
  "BARRETT, DANA",
  "BASLER, DOUGLAS MICHAEL",
  "BECCHI, ROSEMARY",
  "BELL, ADRIENNE",
  "BENJAMIN, LEON MR. SR.",
  "BERGHOEF, BRYAN",
  "BERNSTEIN, CATHY",
  "BISH, CHRISTINE",
  "BIZON, KIMBERLY ANN",
  "BOEBERT, LAUREN",
  "BOGNET, JIM",
  "BOROUGHS, ADAIR FORD",
  "BOURDEAUX, CAROLYN",
  "BOWMAN, JAMAAL",
  "BRADLEY, JAMES P.",
  "BRADY, HELEN",
  "BRANNON, MARY",
  "BROWN, DAVID",
  "BROWN, MAURICUS",
  "BRZOZOWSKI, DANI",
  "BUBSER, CHRISTINE",
  "BUCK, GEORGE WILLIAM PHD",
  "BURCH, LORIE LOUISE",
  "BUSH, CORI",
  "CALDWELL, JAMES ANDREW",
  "CAMMACK, KAT",
  "CARGILE, MIKE",
  "CAWTHORN, DAVID MADISON",
  "CASTLE, JAIME",
  "CHAMPION, WENDELL",
  "CLARK, JOHN",
  "CLEVELAND, HOSEA N",
  "CLINE, NATALIE MS.",
  "CLYDE, ANDREW",
  "COHN, ALAN MICHAEL",
  "COLL, GREGORY THOMAS",
  "COLLICK, JOHN WILLIAM MR. JR.",
  "COLLINS, GENEVIEVE D",
  "CONAWAY, ROBERT DEAN",
  "COOKINGHAM, KEVIN",
  "COTTRELL, DANA",
  "CRUZ, ERIN",
  "CUMMINGS, JOHN C.",
  "CUMMINGS, MAYA ROCKEYMOORE",
  "DANIEL, STEPHEN",
  "DAVIS, MORRIS D. COL.",
  "DEBELLO, JAMES",
  "DEEGAN, DONNA",
  "DENNEY, AUDREY",
  "DISANTO, DELINA",
  "EARLY, ERIC",
  "EHR, PHILLIP CHARLES",
  "ELIASON, ANTONIA",
  "ELLIS, KATHRYN GAIL",
  "ELLISON, ALLEN",
  "EMMONS, JOHN MR.",
  "ENOCH, VANESSA L. DR.",
  "ESSHAKI, ERIC",
  "FARLEY, CHELE CHIAVACCI",
  "FAWELL, WILLIAM W",
  "FEEHAN, DANIEL",
  "FERGUSON, DANA",
  "FINELLO, CHRISTINA",
  "FOX, DR. RICHARD B",
  "FRANCOIS, VENNIA V.",
  "FREELAND, JILLIAN",
  "FRIEDENBERG, MARC",
  "FULLER, VIRGINIA",
  "GAITHER, KEVIN",
  "GALVIN, ALYSE",
  "GAYOT, LUTCHI",
  "GEPPERT, KATY",
  "GERSHON, PERRY",
  "GILBERT, HANK",
  "GILES, DAVID VICTOR",
  "GNIBUS, KRISTY",
  "GODFREY, AARON PAUL",
  "GORE, LAVERNE",
  "GORMAN, JEFF",
  "GORMAN, MEG",
  "GOROFF, NANCY",
  "GRAYSON, DENA MD, PHD",
  "GREENE, JOAN",
  "GREENE, MARJORIE TAYLOR MRS.",
  "GRIFFIN, LEE",
  "GRIGGS, JOYCE MARIE",
  "GUILD, THOMAS EUGENE",
  "GUSTAFSON, CLAIRE H",
  "HACKETT, MARY PATRICIA",
  "HAMILTON, TAMIKA",
  "HAMMOND, SARAH",
  "HANSON, WILLIAM MR.",
  "HARBOUR, JOANNA",
  "HARSHBARGER, DIANA",
  "HARVEY-HALL, PHYLLIS",
  "HAYWOOD, JOSEPH LEE",
  "HICKS, JOSH",
  "HILLIARD, JEROME C",
  "HITES, BECKY E",
  "HOGG, JON MARK MR.",
  "HOLDEN, DAVID",
  "HOLLIDAY, LINDSAY DOZIER DR",
  "HOUSE, STEVEN",
  "HOWZE, TED D II",
  "HUFFMAN, SCOTT",
  "HUGHES, DAVID RUSSELL MR.",
  "HUNT, WESLEY",
  "IANNUZZI, CAROL HIGBEE",
  "JACKSON, RONNY LYNN",
  "JENSEN, JESSE",
  "JOHNSON-GREEN, TABITHA",
  "JOHNSON, ELIZABETH (LIZ)",
  "JONES, JEFF",
  "JONES, MONDAIRE",
  "JORDAN, JEFF",
  "JOY, ELIZABETH L",
  "KEITH, PAMELA M.",
  "KELLER, CRAIG",
  "KENNEDY, AMY",
  "KENNEDY, BRYNNE",
  "KENNEDY, JAMES",
  "KENNEDY, RICHARD",
  "KENNEDY, RONDA",
  "KING, ESTHER JOY",
  "KISTNER, TYLER",
  "KLACIK, KIMBERLY",
  "KOBLE, CLINT MATTHEW MR.",
  "KREISELMAIER, ELIZABETH",
  "KUNKEL, CATHERINE",
  "LADJEVARDIAN, SIMA JANDAGHI",
  "LAIB, RICK",
  "LENZI, RAYMOND CARL DR.",
  "LINDERMAN, HANK",
  "LOMBARD, LAURA",
  "LONDRIGAN, BETSY DIRKSEN",
  "LONG, CAROLYN N.",
  "LOOMER, LAURA",
  "LOVVORN, TRACY LYN",
  "MANGONE, KIM",
  "MANNING, KATHY",
  "MARSILI, THOMASINA",
  "MARTIN, BRANDON RAY",
  "MARTIN, HENRY ROBERT",
  "MARX, WILLIAM A MR. III",
  "MASON, MIA",
  "MCARDLE SCHULMAN, MAUREEN B MRS",
  "MCCLAIN, LISA",
  "MCCORKLE, ISAAC IAN",
  "MCCORMICK, RICHARD DEAN DR.",
  "MCCREADY, DANIEL",
  "MCDOWELL, JAN",
  "MEIJER, PETER MR.",
  "MENSING, DALE KENNETH",
  "MILLER, MARY",
  "MITRANO, TRACY",
  "MITRIS, GEORGE MR.",
  "MOORE, BLAKE",
  "MRVAN, FRANK J.",
  "MUERI, HILLARY O'CONNOR",
  "MUSCATO, MICHAEL ARCHANGEL",
  "NALBANDIAN, JOHNNY",
  "NEGRON, LUKE",
  "NEIGHBORS, RICKY ALLEN",
  "NELSON, KIMBERLY",
  "NEWBY, JOEL III",
  "NEWMAN, MARIE",
  "O'MARA, WILLIAM EDWARD DR IV",
  "OLIVER, JULIE LYNN",
  "OLSON, WILLIAM",
  "OWENS, BURGESS",
  "OWENSBY, ALEXANDRA DR.",
  "PALLOTTA, FRANK",
  "PALOMBI, CHRISTOPHER",
  "PALZEWICZ, TOM",
  "PANDY, DEVIN D MR SR",
  "PARKER, DAVID PAYNE",
  "PARNELL, RICHARD SEAN",
  "PATTERSON, ROBERT BUZZ",
  "PEACOCK, JULIA",
  "PEARSON, ERIKA STOTTS",
  "PENNIE, DEMETRICK TRE DR.",
  "PFLUGER, AUGUST LEE II",
  "PHILLIPS, GEORGE KARL MR",
  "PITERMAN, KONSTANTINE NIKKA-SHER",
  "POLACK, ROGER",
  "POTTER, QUENTIN",
  "PREMPEH, WILLIAM SREBOE",
  "PRICE, PHILLIP",
  "PRUDEN, JAMES L.",
  "PRUETT, DASHA",
  "QUALLS, KENDALL",
  "QUICK, LAURA",
  "QUINN, CHRISTINE YVONNE",
  "RAZZOLI, MARK",
  "RICHTER, DAVID",
  "RING, LISA M.",
  "ROBINSON, ARTHUR BROUHARD",
  "ROBINSON, EMILY",
  "RODIMER, DAN",
  "ROSS, GENA L DR",
  "ROWLEY, TODD",
  "RUBANDO, NICK",
  "SAGAN, GREGORY T. MR.",
  "SCHELLER, LISA",
  "SCHMID, STEPHANIE",
  "SCHOLTEN, HILLARY",
  "SCHOLTEN, JAMES D.",
  "SCHRODER, KATE",
  "SCOTT, JOSHUA",
  "SEIKALY, HELANE LULU SAWSAN",
  "SHEARER, ALAINA",
  "SHEDD, TIFFANY",
  "SIEGEL, MICHAEL",
  "SIMMONS, LINDSEY NICOLE",
  "SKARLATOS, ALEK",
  "SMITH, AJA",
  "SMITH, SANDY",
  "SMITH, SUSAN MARIE",
  "SPALDING, CARLA A",
  "SPARTZ, VICTORIA",
  "SPENSER, ALEX",
  "SPICER, LAVERN",
  "STANTON KING, ANGELA",
  "STOCKHAM, CASPER WESLEY",
  "STREICKER, MARGARET",
  "SULLIVAN, DAVID XAVIER",
  "SWISHER, CARLTON AARON",
  "TEAGUE, JOHNNY MARK DR.",
  "THERON, DANIEL PETER",
  "THORNTON, DIERDRE",
  "THORPE, DEVIN D",
  "TIMMONS-GOODSON, PATRICIA",
  "TIMS, DESIREE",
  "TODD, IAN ANDREW",
  "TRUNDLE, RYAN",
  "TUMAN, DOUGLAS",
  "VAN AUSDAL, KEVIN",
  "VAN DE WATER, KYLE",
  "VAN ORDEN, DERRICK F. MR.",
  "WALKER, KIMBERLY H MS",
  "WALLACE, CYNTHIA",
  "WALSINGHAM, BLAIR NICOLE",
  "WATSON, MELISSA WARD",
  "WATSON, THOMAS W",
  "WEAVER, ERIKA C",
  "WEBB, BRYANT CAMERON DR.",
  "WEBBER, ERROL MR.",
  "WEBER, ROBERT MARION",
  "WELCH, LISA RENEA DR.",
  "WESTLEY, TIMMY LEE",
  "WESTON, JOHN KAEL MR",
  "WHITTEN, SANDRA LINN MRS.",
  "WILLIAMS, CELESTE SARENE",
  "WILLIAMS, VANGIE",
  "WILLIAMS, WILLIAM HENRY",
  "WINFREY, ADIA MCCLELLAN DR",
  "WRIGHT, KENNETH W DR.",
  "ZAHRADKA, TAWNJA",
  "ZMICH, THOMAS"
)

# Hand-coded candidates who have held elected office. Used below to set
# `experience` to "Held Elected Office" on cycle-2020 and cycle-2022 rows.
heldoffices20 <- c(
  "ARENHOLZ, ASHLEY HINSON",
  "AUCHINCLOSS, JAKE",
  "BEELER, JEFFREY ALLEN SR",
  "BENTIVOLIO, KERRY", # first elected 2012; ran again in 2020
  "BENTZ, CLIFF",
  "BICE, STEPHANIE",
  "BOLZ, KATE",
  "BRAT, DAVID ALAN",
  "BRISCOE, JOHN",
  "CARL, JERRY LEE, JR",
  "COURSER, AMY RYAN",
  "CRAFTS, DALE",
  "DAVIS, JAMES (JIM) WAYLAND DDS, MS",
  "DAVIS, WENDY",
  "DE LA ISLA, MICHELLE",
  "DEPASQUALE, EUGENE",
  "DOGLIO, BETH",
  "DONNELLY, TIMOTHY M.",
  "FAY, MARY",
  "FEENSTRA, RANDALL",
  "FISCHBACH, MICHELLE",
  "FITZGERALD, SCOTT L",
  "FRANKLIN, SCOTT MR.",
  "FREITAS, NICK J",
  "FRICILONE, MIKE",
  "GARBARINO, ANDREW",
  "GOOD, MARGARET ELIZABETH ROWELL",
  "GOOD, ROBERT G.",
  "GORDON, JACQUELINE",
  "HAGAN, CHRISTINA",
  "HALL, JULIE",
  "HART, RITA",
  "HOADLEY, JON",
  "IVES, JEANNE",
  "JOHNSON, LACY",
  "KEAN, THOMAS H. JR.",
  "KING, JESSICA JEANE MS",
  "LATURNER, JAKE",
  "MACE, NANCY",
  "MANN, TRACEY ROBERT",
  "MARCHANT, JIM",
  "MARYOTT, BRIAN",
  "MCLEOD-SKINNER, JAMIE",
  "MILLER-MEEKS, MARIANNETTE JANE",
  "MOORE, FELIX BARRY",
  "NEHLS, TROY",
  "NYSTROM, QUINN REABE",
  "OBERNOLTE, JAY",
  "OBERWEIS, JIM",
  "PARROTT, NEIL CONRAD MR.",
  "RATHS, GREGORY GERARD",
  "ROSENDALE, MATT",
  "ROSS, DEBORAH",
  "ROUSE, ERIC S",
  "RUFF, ANDY",
  "SALLING, JOHNNY RAY MR.",
  "SALTER, CAROLYN",
  "SCHUPP, JILL DARLYNE",
  "SMITH, CHRISTY",
  "STUCK, AMANDA",
  "TAYLOR, SCOTT W",
  "TORRES, RITCHIE JOHN",
  "VALADAO, DAVID",
  "VALENZUELA, CANDACE",
  "VAN DUYNE, ELIZABETH ANN",
  "WHITE, PHILANISE",
  "WILLIAMS, NIKEMA NATASSHA"
)

# Hand-coded candidates who have held elected office. Used below to set
# `experience` to "Held Elected Office" on cycle-2022 rows.
heldoffices22 <- c(
  "CROCKETT, JASMINE",
  "JACOBS, SARA",
  "SWARTZ, CRAIG STEPHEN",
  "BOHANNAN, CHRISTINA",
  "BEACH-FERRARA, JASMINE",
  "FRISCH, ADAM",
  "MCGARVEY, MORGAN",
  "HOYLE, VALERIE",
  "MOSKOWITZ, JARED",
  "GUNBY, PATRICIA WASHBURN",
  "NICKEL, WILEY",
  "CARTER, TROY A. SR.",
  "HOULAHAN, STEPHEN WILLIAM",
  "SYKES, EMILIA",
  "PERRY, JAN",
  "CANEPA, DAVID",
  "FLEMING, BRIDGET",
  "JACKSON, JEFF",
  "SMITH, ERICA DANETTE",
  "GILLEN, LAURA",
  "DUPREE, JOHNNY L.",
  "STANSBURY, MELANIE",
  "ENGEL, KIRSTEN",
  "IVEY, GLENN",
  "MULLIN, KEVIN",
  "MARLINGA, CARL",
  "DAVIS, DON",
  "FOUSHEE, VALERIE",
  "LANDSMAN, GREG",
  "GRAY, ADAM C.",
  "PETTERSEN, BRITTANY LOUISE MS.",
  "KAMLAGER, SYDNEY",
  "HARMON, MARK DESMOND DR.",
  "CAMPBELL, HEIDI",
  "MATHIS, ELIZABETH",
  "JONES, HERBERT CARROLL JR",
  "RONNING, PENNY",
  "ROSE, MAX",
  "PANSING BROOKS, PATTY",
  "HODGES, MONTE",
  "HARDER, JOSH",
  "LEE, SUMMER",
  "MAGAZINER, SETH",
  "DOGGETT, LLOYD",
  "HILL, TARTISHA A MRS.",
  "MIZEUR, HEATHER RENAY",
  "WRIGHT, GLENN",
  "RYAN, PATRICK",
  "BALINT, REBECCA 'BECCA'",
  "PUDLO, G MAEBE A. GIRL",
  "HOLLISTER, ROBERT MICHAEL III",
  "PFAFF, BRAD"
)

# Hand-coded candidates with no prior elected office. Used below to set
# `experience` to "None" on cycle-2020 and cycle-2022 rows that no earlier
# rule in that case_when() has already matched.
nones22 <- c(
  "CURTIS, CLINT",
  "BUDZINSKI, NIKKI",
  "LINDERMAN, HANK",
  "LEE LAKE, JEANNINE L",
  "LYRAS, LOUIS GEORGE",
  "ESRATI, DAVID",
  "DELLA PIA, MAX HAROLD",
  "MARKEL, HOMER CHIP MR.",
  "MCCORMICK, RAY",
  "FYFE, MATTHEW JOHN",
  "YETTER, JOSEPH FREDERICK III",
  "OLIVO, CHRISTINE ALEXANDRIA",
  "JOHNSON-GREEN, TABITHA",
  "CASTELLI, MATT",
  "PARRISH, BERNARD KYLE",
  "BUTLER, DARRIUS MAURICE",
  "MEADOWS, SAMANTHA",
  "WOOD, MARISA",
  "PETERSON, KAYLEE JADE",
  "SCHMIDT, PATRICK",
  "KLUSSMANN, DUNCAN FOSTER",
  "BROWN, MARY KATHLEEN",
  "TERRY, JOANNE R",
  "HAYS, LAUREN",
  "NORMAN, WENDY ANN",
  "SCHOLTEN, HILLARY",
  "MANN, BETHANY EILEEN",
  "KILBOY, MATTHEW",
  "ZIMMERMAN, ROBERT",
  "FROST, MAXWELL ALEJANDRO",
  "HODGE, JEVIN D",
  "WARNER-STANTON, KATHY L",
  "VEASEY, KAREN LIN MRS",
  "SORENSEN, ERIC",
  "BARKLEY, MICHAEL JAMES",
  "JOSEPHSON, GARY",
  "HERRING, WADE WILKES MR. II",
  "ALEXANDER, TIMOTHY C",
  "MARTIN, HENRY ROBERT",
  "ROE, ANN",
  "BEARD, JAMES KENNETH",
  "LEHMAN, MATTHEW",
  "CONOLE, FRANCIS",
  "JONES, KERMIT",
  "CHRISTIAN, ROBERT FRANKLIN MR. III",
  "KALE, ANDREA DORIA",
  "BALDERRAMOS ROBINSON, CORINNA",
  "ROLLINS, WILL",
  "JONES, REBEKAH",
  "WENDELL, BARRY LEE MR.",
  "JONES, LAURA",
  "GREEN, KAREN REV. DR.",
  "BLACK, DIANNE DODSON",
  "MUNNS, SHANTE",
  "TRANEL, MONICA",
  "HAWK, DANIELLE NICOLE",
  "FORD, MICHAEL C. MR.",
  "STEURY, PAUL MR.",
  "GOLDMAN, DANIEL",
  "FLOWERS, MARCUS",
  "LORINSER, ROBERT JAMES",
  "GOLDBERG, BARTLEY F.",
  "RADAKER-SHEAFER, KRISTEN",
  "GASKINS, BARBARA D MS.",
  "HOLDEN, STEVEN WESLEY SR.",
  "MCDOWELL, JAN",
  "HAIRE, MARVIN JONATHAN",
  "HARVEY-HALL, PHYLLIS",
  "AUSMAN, RICHARD",
  "MCDONALD, DARLENE",
  "MCCORKLE, ISAAC IAN",
  "OMERE, IROGHAMA C",
  "MELTON, RYAN MICHAEL APPLETON",
  "JACKSON, JONATHAN",
  "BANYAI, CINDY LYN",
  "JOHNSON, ELIZABETH (LIZ)",
  "LANGE, PAUL",
  "JAYE, BRIAN STEVEN",
  "MITCHELL, NICHOLAS",
  "ANDREWS, NAOMI",
  "HARRIS-TILL, JOSHUA",
  "HADERLEIN, ELIZABETH",
  "STEINER, MAX",
  "MARSHALL, DEREK",
  "YOUNG, SHUWASKI",
  "WALKER, KIMBERLY H MS",
  "HILL, NATASHA",
  "AUSBROOKS, JIMMY CLIFTON",
  "HALBLEIB, CONOR",
  "WILLIAMS, MIKAL D MR",
  "NEIGHBORS, RICKY ALLEN",
  "WILSON, TAMIE",
  "VAN SOMEREN, MIKE",
  "COHN, ALAN MICHAEL",
  "PASTORE, DANIEL",
  "BRANNON, MARY",
  "EHASZ, ASHLEY",
  "WALDMAN, AMANDA",
  "HOLLOWAY, LASHONDA J",
  "JEFFERSON, JRMAR",
  "DELUZIO, CHRISTOPHER",
  "LYNN, ERIC",
  "ALMONORD, VALBRUN",
  "YOUNG, GEOFFREY M",
  "HUNT, TREY JOSEPH",
  "JENKINS, MATTHEW",
  "ANDREWS, ANNIE DR.",
  "SNYDER, GARY",
  "WIRTH, CYNTHIA",
  "LARKINS, JUDDSON",
  "SCHNEIDER, JAN",
  "DIEMER, MATTHEW",
  "THRONEBURG, JOSHUA",
  "MARTIN, ADAM",
  "HUFFMAN, SCOTT",
  "RILEY, JOSH",
  "GORMAN, MARY",
  "MCCALLIAN, RANDI MCCALLIAN",
  "LEWIS, JENNIFER LYNN",
  "GENANT, PAMELA",
  "STEELE, RUSSELL WAYNE MR.",
  "COOPER, RANDAL DEAN",
  "FULFORD, ROBIN DAWN",
  "HILLIARD, JEROME C",
  "GLUESENKAMP PEREZ, MARIE",
  "WHITE, DOUG"
)

# TAMIE/TAMARA WILSON, NICHOLAS MITCHELL duplicated

# Backfill `experience` for 2020 and 2022.
# `held_office` is 1/0/NA according to whether the recorded experience is
# "Held Elected Office"; it is carried forward within each candidate (rows are
# sorted by cycle first) and then combined with the hand-coded name lists.
# The case_when() rules are applied in order, so earlier rules win.
dat <- dat %>%
  group_by(candidate) %>%
  arrange(cycle) %>%
  # Comparing against NA yields NA, so a missing experience stays missing.
  mutate(held_office = as.numeric(experience == "Held Elected Office")) %>%
  fill(held_office) %>%
  ungroup() %>%
  mutate(
    experience = case_when(
      cycle == 2020 & held_office == 1 ~ "Held Elected Office",
      cycle %in% c(2020, 2022) & candidate %in% heldoffices20 ~ "Held Elected Office",
      cycle == 2020 & candidate %in% nones20 ~ "None",
      cycle == 2022 & candidate %in% heldoffices22 ~ "Held Elected Office",
      cycle %in% c(2020, 2022) & candidate %in% nones22 ~ "None",
      TRUE ~ experience
    )
  )



# Read in the Cook House ratings and build the join keys.
# `statedis` is the district code with its zeros stripped when the third
# character is "0"; `cycle` is the rating year.
cook <- read.csv("data/raw/cook_house_ratings_stacked_2010-2022.csv") %>%
  mutate(
    statedis = case_when(
      substr(district, 3, 3) == "0" ~ gsub("0", "", district),
      TRUE ~ district
    )
  ) %>%
  select(cook_dir, cook_folded, statedis, cycle = year)


# Attach the Cook ratings. This is a natural join on whichever columns `dat`
# and `cook` share -- presumably statedis and cycle; confirm against the
# join message.
dat <- left_join(dat, cook)

# In 2010, races that were considered safe for the incumbent's party are not
# coded, so the winner's party is used to code the race as safe for that
# party. The rules below have no cycle condition: they apply to every row
# whose Cook rating is missing. NY10/PA1 and WA5 are hand-coded, and any
# cook_dir still missing is then copied from another row of the same
# district-cycle.
dat <- dat %>%
  mutate(
    cook_dir = case_when(
      is.na(cook_dir) & win == 1 & party == "Democratic" ~ -1,
      is.na(cook_dir) & win == 1 & party == "Republican" ~ 1,
      is.na(cook_dir) & statedis %in% c("NY10", "PA1") ~ -1,
      is.na(cook_dir) & statedis == "WA5" ~ 1,
      TRUE ~ cook_dir
    ),
    cook_folded = case_when(
      is.na(cook_folded) ~ 0,
      TRUE ~ cook_folded
    )
  ) %>%
  group_by(statedis, cycle) %>%
  fill(cook_dir, .direction = "updown") %>%
  # Drop the grouping so later steps do not silently run on a grouped frame.
  ungroup()


# Hand-code `win` as 0 for the listed 2020 candidates whose outcome is
# missing; every other row keeps its existing value.
dat <- dat %>%
  mutate(
    win = case_when(
      is.na(win) & cycle == 2020 & candidate %in% c(
        "KOBLE, CLINT MATTHEW MR.",
        "DENHAM, JEFF",
        "FRIEDENBERG, MARC",
        "CLARK, JOHN",
        "PHILLIPS, GEORGE KARL MR",
        "THORNTON, DIERDRE",
        "GERSHON, PERRY",
        "TODD, IAN ANDREW",
        "DONNELLY, TIMOTHY M.",
        "LIPINSKI, DANIEL",
        "MCDOWELL, JAN",
        "BUCK, GEORGE WILLIAM PHD",
        "SAGAN, GREGORY T. MR.",
        "ALBRO, CATHERINE",
        "WATKINS, STEVE",
        "MCCREADY, DANIEL",
        "GUILD, THOMAS EUGENE",
        "MCCANN, JOHN JOSEPH MR. JR.",
        "GEPPERT, KATY",
        "CONAWAY, ROBERT DEAN",
        "HOLDEN, DAVID",
        "MCLEOD-SKINNER, JAMIE",
        "YODER, KEVIN W",
        "GRAYSON, DENA MD, PHD",
        "HUGHES, DAVID RUSSELL MR.",
        "GAITHER, KEVIN",
        "TRUNDLE, RYAN",
        "FULLER, VIRGINIA",
        "WILLIAMS, WILLIAM HENRY",
        "WILLIAMS, VANGIE",
        "RING, LISA M.",
        "ROUSE, ERIC S",
        "BURCH, LORIE LOUISE",
        "PEACOCK, JULIA",
        "TIPTON, SCOTT",
        "PRICE, PHILLIP",
        "FOX, DR. RICHARD B",
        "RATCLIFFE, JOHN L",
        "MARTIN, HENRY ROBERT",
        "ENGEL, ELIOT",
        "AMASH, JUSTIN",
        "ROBINSON, EMILY",
        "CUMMINGS, MAYA ROCKEYMOORE",
        "FAWELL, WILLIAM W",
        "SPANO, VINCENT ROSS",
        "WRIGHT, KENNETH W DR.",
        "GAYOT, LUTCHI",
        "BRAT, DAVID ALAN",
        "BENTIVOLIO, KERRY",
        "KNIGHT, STEVE",
        "ASHFORD, ANN",
        "RIGGLEMAN, DENVER LEE MR. III",
        "ROBINSON, ARTHUR BROUHARD",
        "KING, STEVE MR.",
        "CLAY, WILLIAM LACY JR",
        "WESTLEY, TIMMY LEE"
      ) ~ 0,
      TRUE ~ win
    )
  )

# get vote shares
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/IG0UN2
#
# Result: one row per district-cycle-party (the top Republican and the top
# Democrat by candidatevotes), with
#   win       = 1 if that row has the most votes among the Republican and
#               Democratic rows of its district-cycle, else 0
#   voteshare = candidatevotes / totalvotes
# Ties on candidatevotes keep every tied row.
votes <- read.csv("data/raw/1976-2020-housevotes.csv") %>%
  filter(year >= 2010 & party %in% c("REPUBLICAN", "DEMOCRAT")) %>%
  mutate(
    # District 0 is recoded to 1 (presumably at-large seats -- confirm) so
    # the code matches the statedis convention used in `dat`.
    district = ifelse(district == 0, 1, district),
    statedis = paste0(state_po, district),
    cycle = year,
    party = case_when(
      party == "REPUBLICAN" ~ "Republican",
      party == "DEMOCRAT" ~ "Democratic"
    )
  ) %>%
  select(statedis, cycle, party, candidate, candidatevotes, totalvotes) %>%
  group_by(statedis, cycle) %>%
  mutate(win = as.numeric(candidatevotes == max(candidatevotes, na.rm = TRUE))) %>%
  group_by(statedis, cycle, party) %>%
  filter(candidatevotes == max(candidatevotes, na.rm = TRUE)) %>%
  ungroup() %>%
  # The division is already vectorised; no rowwise() needed.
  mutate(voteshare = candidatevotes / totalvotes) %>%
  select(statedis, cycle, win, voteshare)


# Merge the vote shares onto the candidate data. This is a natural join on the
# columns the two frames share -- presumably statedis, cycle and win, so
# winners pick up the winner's share and losers the loser's; confirm against
# the join message.
# NOTE(review): the second filter condition also drops every row whose
# pacsandothercomittees is NA, because the comparison evaluates to NA.
dat <- dat %>%
  full_join(votes) %>%
  filter(
    !is.na(win),
    pacsandothercomittees != 1956.76 # get rid of duplicated Nicholas Mitchell
  )

# get incumbent race and party
# For each district, take the winners in cycle order and lag their race and
# party by one row, so each district-cycle carries the race/party of the
# winner in that district's previous row (NA for the first one).
winners <- dat %>%
  filter(win == 1) %>%
  group_by(statedis) %>%
  arrange(cycle) %>%
  mutate(
    race_lag = dplyr::lag(race),
    party_lag = dplyr::lag(party)
  ) %>%
  select(statedis, cycle, race_lag, party_lag)

# Natural join back onto the candidate rows.
dat <- dat %>% full_join(winners)

# Keep only white and black candidates, drop the helper columns created
# during cleaning, and write the cleaned file.
dat <- dat %>%
  filter(race %in% c("white", "black")) %>%
  select(-c(n, held_office, seniority18, seniority20))

write.csv(dat, "data/clean/SorensenChenPRQ_cleaned.csv")


# appendix B ----

# marginal mean table

# Study 1: reverse-scale `placement` onto [1, 0] and relabel one climate level.
candidates <- read.csv("data/raw/study1.csv") %>%
  mutate(
    placement = rescale(placement, to = c(1, 0)),
    # NOTE(review): this rescales the already-reversed `placement` from the
    # line above, so `placement2` is reversed a second time -- confirm that
    # this is intended.
    placement2 = rescale(placement, to = c(1, 0)),
    climate = ifelse(
      climate == "Promote renewables",
      "Promote alternatives",
      climate
    )
  )

# Study 2 (CSVY file; strings are read as factors).
candidates2 <- csvy::read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE)

# Study 3.
candidates3 <- read.csv("data/raw/study3.csv")

# Study 1 columns harmonised to the study 2 names so the two samples can be
# stacked. Measures that study 1 does not have are added as typed NAs, and X
# is suffixed with the sample number.
candidates1_temp <- candidates %>%
  dplyr::select(
    race,
    chosen_candidate = won,
    repsme, repsgroups, goodjob, goodchance, demvotes, swingvotes,
    X, placement, reparations, policy_payreps,
    experience = exp,
    endorsement = endorse,
    healthcare = health,
    climate, job,
    policy_healthcare = policy_health,
    policy_fossil, candage, strength_dem,
    age, female, hhinc, educ
  ) %>%
  mutate(
    canddistance = NA_real_,
    realdirdiff = NA_real_,
    sample = 1,
    X = paste(X, sample),
    candselfplacement = NA_character_,
    treat = NA_character_
  )

# Study 2, restricted to respondents with pid 5, 6 or 7, reshaped to the same
# columns as candidates1_temp.
candidates2_temp <- candidates2 %>%
  filter(pid %in% c(5, 6, 7)) %>%
  mutate(
    sample = 2,
    X = paste(X, sample),
    placement = NA_real_,
    # Recode the candidate-age category (1-5) to an age in years; any other
    # value becomes NA.
    candage = case_when(
      candage == 1 ~ 44,
      candage == 2 ~ 47,
      candage == 3 ~ 50,
      candage == 4 ~ 55,
      candage == 5 ~ 58
    ),
    strength_dem = case_when(
      pid == 7 ~ 1,
      pid == 6 ~ 0.5,
      pid == 5 ~ 0,
      TRUE ~ NA_real_
    ),
    # In the "Self-placement" treatment the policy columns are blanked...
    policy_reparations = case_when(
      treat == "Self-placement" ~ NA_real_,
      TRUE ~ policy_reparations
    ),
    policy_climate = case_when(
      treat == "Self-placement" ~ NA_real_,
      TRUE ~ policy_climate
    ),
    policy_healthcare = case_when(
      treat == "Self-placement" ~ NA_real_,
      TRUE ~ policy_healthcare
    ),
    # ...and candselfplacement is kept only for that treatment.
    candselfplacement = as.character(candselfplacement),
    candselfplacement = case_when(
      treat == "Self-placement" ~ candselfplacement,
      TRUE ~ NA_character_
    ),
    candselfplacement = factor(candselfplacement)
  ) %>%
  rename(
    policy_payreps = policy_reparations,
    policy_fossil = policy_climate
  ) %>%
  dplyr::select(
    race, chosen_candidate, repsme, repsgroups, goodjob, goodchance,
    demvotes, swingvotes, canddistance, realdirdiff, X,
    reparations, policy_payreps, policy_healthcare,
    experience, endorsement, healthcare, climate, job,
    candselfplacement, treat, policy_fossil, candage, strength_dem,
    placement, sample, age, female, hhinc, educ
  )


# Stack the harmonised study 1 and study 2 candidate profiles.
temp <- rbind(candidates1_temp, candidates2_temp)

temp <- temp %>%
  group_by(X) %>%
  mutate(temp = sum(canddistance, na.rm = F),
         temp2 = sum(realdirdiff, na.rm = F),
         temp3 = sum(repsme, na.rm = F),
         temp4 = sum(repsgroups, na.rm = F),
         temp5 = sum(goodchance, na.rm = F),
         temp6 = sum(goodjob, na.rm = F),
         temp7 = sum(placement, na.rm = F),
         temp8 = sum(swingvotes, na.rm = F),
         temp9 = sum(demvotes, na.rm = F)) %>%
  rowwise() %>%
  mutate(other_canddistance = temp - canddistance,
         other_realdirdiff = temp2 - realdirdiff,
         other_repsme = temp3 - repsme,
         other_repsgroups = temp4 - repsgroups,
         other_goodchance = temp5 - goodchance,
         other_goodjob = temp6 - goodjob,
         other_placement = temp7 - placement,
         other_swingvotes = temp8 - swingvotes,
         other_demvotes = temp9 - demvotes) %>%
  ungroup() %>%
  mutate(canddistance_folded = round(abs(canddistance - 0.5) * 2, 5),
         other_canddistance_folded = round(abs(other_canddistance - 0.5) * 2, 5),
         realdirdiff_folded = round(abs(realdirdiff - 0.5) * 2, 5),
         other_realdirdiff_folded = round(abs(other_realdirdiff - 0.5) * 2, 5)) %>%
  rowwise() %>%
  mutate(diffdistance = canddistance - other_canddistance,
         diffdistance_folded = canddistance_folded - other_canddistance_folded,
         diffistance_real = realdirdiff - other_realdirdiff,
         diffdistance_real_folded = realdirdiff_folded - other_realdirdiff_folded,
         diffrepsme = repsme - other_repsme,
         diffrepsgroups = repsgroups - other_repsgroups,
         diffgoodchance = goodchance - other_goodchance,
         diffgoodjob = goodjob - other_goodjob,
         diffplacement = placement - other_placement,
         diffswingvotes = swingvotes - other_swingvotes,
         diffdemvotes = demvotes - other_demvotes) %>%
  ungroup() %>%
  dplyr::select(-temp) %>%
  mutate(diffdistance_real_folded = rescale(diffdistance_real_folded, to = c(1, -1)),
         diffdistance_folded = round(rescale(diffdistance_folded, to = c(1, -1)), 2),
         job = factor(job),
         experience = factor(experience),
         endorsement = factor(endorsement),
         healthcare = factor(healthcare),
         climate = factor(climate),
         reparations = factor(reparations),
         race = factor(race),
         candselfplacement = factor(candselfplacement),
         candage = factor(candage)) %>%
  group_by(X) %>%
  mutate(nrow = row_number(),
         opponent_job = case_when(nrow==1~lead(job),T~lag(job)),
         opponent_experience = case_when(nrow==1~lead(experience),T~lag(experience)),
         opponent_endorsement = case_when(nrow==1~lead(endorsement),T~lag(endorsement)),
         opponent_healthcare = case_when(nrow==1~lead(healthcare),T~lag(healthcare)),
         opponent_climate = case_when(nrow==1~lead(climate),T~lag(climate)),
         opponent_reparations = case_when(nrow==1~lead(reparations),T~lag(reparations)),
         opponent_candselfplacement = case_when(nrow==1~lead(candselfplacement),T~lag(candselfplacement)))



# Samples 1 and 2: keep the candidate attributes and respondent covariates
# shared by the pooled conjoint data, and add NA placeholders for the
# attributes that the other conjoint tables carry.
conjoint12 <- temp %>%
  dplyr::select(
    race, chosen_candidate, X,
    reparations, experience, endorsement, healthcare, climate, job,
    candselfplacement, candage,
    age, female, educ, hhinc
  ) %>%
  mutate(
    gender = NA,
    priority = NA,
    endorsement2 = NA
  )

# Sample 3: rows with order "conjointfirst" and X1_pid 5, 6 or 7. Attributes
# that this table does not fill in are added as NA so it has the same columns
# as the other conjoint tables.
conjoint3 <- candidates3 %>%
  filter(order == "conjointfirst" & X1_pid %in% c(5, 6, 7)) %>%
  rename(candselfplacement = ideo,
         experience = exp,
         endorsement = group) %>%
  mutate(X = paste(X, 3),
         reparations = NA,
         healthcare = NA,
         climate = NA,
         candage = factor(candage),
         gender = NA) %>%
  # `gender` was created above but then left out of the selection, making this
  # the only conjoint table without that column; it is now kept.
  dplyr::select(race,
                chosen_candidate,
                X,
                reparations,
                experience,
                endorsement,
                healthcare,
                climate,
                job,
                candselfplacement,
                candage,
                gender,
                age,
                female,
                hhinc,
                educ) %>%
  mutate(priority = NA,
         endorsement2 = NA)

# Sample 4: "Control" form rows of the study-4 file, with columns renamed or
# added to match the other conjoint tables.
conjoint4 <- read.csv("data/raw/study4.csv") %>%
  filter(form == "Control") %>%
  mutate(
    candage = factor(candidate_age),
    climate = fossil,
    candselfplacement = NA,
    X = paste(X, 4)
  ) %>%
  dplyr::select(
    race, chosen_candidate, X,
    reparations, experience, endorsement, healthcare, climate, job,
    candselfplacement, candage, gender,
    age, female, hhinc, educ
  ) %>%
  mutate(
    priority = NA,
    endorsement2 = NA
  )

# CA omnibus file: the respondent's own race/gender columns are moved aside so
# that `race` and `gender` hold the candidate's attributes, as in the other
# conjoint tables. ResponseId becomes the id column X.
conjoint5 <- read.csv("data/raw/ca_omnibus.csv") %>%
  select(-X) %>%
  rename(
    resprace = race,
    respgender = gender,
    race = candrace,
    gender = candgender,
    experience = candexp,
    candselfplacement = candideo,
    endorsement = candgroup,
    job = candjob,
    X = ResponseId
  ) %>%
  mutate(
    healthcare = NA,
    climate = NA,
    reparations = NA,
    # r_sex 1 -> 0, r_sex 2 -> 1, anything else -> NA
    female = case_when(
      r_sex == 1 ~ 0,
      r_sex == 2 ~ 1,
      TRUE ~ NA_real_
    )
  ) %>%
  dplyr::select(
    race, chosen_candidate, X,
    reparations, experience, endorsement, healthcare, climate, job,
    candselfplacement, candage, gender,
    age, female, hhinc, educ
  ) %>%
  # drop rows whose candidate race or gender is an empty string
  filter(race != "" & gender != "") %>%
  mutate(
    race = factor(race),
    candage = factor(candage),
    gender = factor(gender),
    priority = NA,
    endorsement2 = NA
  )

# Study-5 file: white respondents (men and women) with nblack == 0.5.
# Lower-case raw labels are recoded to the capitalised labels used in the
# pooled data, and respondent covariates are put on the 0-1 scales used here.
conjoint6 <- read.csv("data/raw/study5.csv") %>%
  mutate(
    resptype = case_when(
      resprace == "black" & respgender == "woman" ~ "Black women",
      resprace == "black" & respgender == "man" ~ "Black men",
      resprace == "white" & respgender == "woman" ~ "White women",
      resprace == "white" & respgender == "man" ~ "White men",
      TRUE ~ NA_character_
    ),
    resptype = factor(
      resptype,
      levels = c("White men", "White women", "Black men", "Black women"),
      ordered = TRUE
    )
  ) %>%
  filter(nblack == 0.5 & resptype %in% c("White men", "White women")) %>%
  mutate(
    race = factor(str_to_title(candrace)),
    candage = factor(candage),
    gender = factor(str_to_title(candgender)),
    # unmatched labels become NA (no default branch)
    job = case_when(
      candjob == "doctor" ~ "Doctor",
      candjob == "business executive" ~ "Business executive",
      candjob == "activist" ~ "Activist",
      candjob == "high school teacher" ~ "High school teacher",
      candjob == "lawyer" ~ "Lawyer",
      candjob == "college professor" ~ "College professor"
    ),
    job = factor(job),
    candexp = case_when(
      candexp == "no prior political experience" ~ "No prior political experience",
      candexp == "school board member" ~ "School Board Member",
      candexp == "mayor of a small city" ~ "Mayor of a small city",
      candexp == "mayor of a large city" ~ "Mayor of a large city",
      candexp == "state legislator" ~ "State Legislator"
    ),
    experience = factor(candexp),
    priority = case_when(
      candpriority == "strengthen gun control through commonsense restrictions" ~ "gun control",
      candpriority == "regulate co2 emissions to combat global warming" ~ "regulate co2 emissions",
      candpriority == "raise taxes on those making more than $250,000 a year" ~ "tax the wealthy",
      candpriority == "provide a path to citizenship for undocumented immigrants" ~ "path to citizenship",
      candpriority == "promote expanding free trade agreements" ~ "expand free trade deals",
      candpriority == "expand government and unemployment assistance for those in need" ~ "expand social safety net",
      candpriority == "defend the rights of lgbt individuals" ~ "lgbt rights"
    ),
    priority = factor(priority),
    candgroup = case_when(
      candgroup == "civil rights groups" ~ "Civil rights groups",
      candgroup == "veterans groups" ~ "Veterans groups",
      candgroup == "reproductive rights groups" ~ "Reproductive rights groups",
      candgroup == "major area newspapers" ~ "Major area newspapers"
    ),
    # this study's endorsers go in `endorsement2`; `endorsement` is set to NA
    endorsement2 = factor(candgroup),
    reparations = NA,
    healthcare = NA,
    climate = NA,
    candselfplacement = NA,
    # any respgender other than "woman" is coded 0
    female = case_when(
      respgender == "woman" ~ 1,
      TRUE ~ 0
    ),
    # note: paste() adds its own separator, so the suffix is two spaces + "5"
    X = paste(X, " 5"),
    age = respage,
    educ = case_when(
      educ == "less than hs" ~ 0,
      educ == "high school degree" ~ 0.25,
      educ == "some college, no degree" ~ 0.5,
      educ == "vocational training" ~ 0.5,
      educ == "aa" ~ 0.5,
      educ == "ba" ~ 0.75,
      educ %in% c("doctorate", "ma or professional degree") ~ 1,
      TRUE ~ NA_real_
    ),
    # NOTE(review): no pattern contains "25k-" on its own; confirm that the
    # raw income brackets are all covered by these patterns.
    hhinc = case_when(
      grepl("less|15k-|20k-", hhinc) ~ 0,
      grepl("30k-|35k-|40k-|45k-|50k-", hhinc) ~ 0.25,
      grepl("55k-|60k-|65k-|70k-|75k-|80k-", hhinc) ~ 0.5,
      grepl("85k-|90k-|95k-|100k-|125k-", hhinc) ~ 0.75,
      grepl("150k-|175k-|200k-|250k-", hhinc) ~ 1,
      TRUE ~ NA_real_
    ),
    endorsement = NA
  ) %>%
  select(c(
    age, candage, candselfplacement, chosen_candidate, climate, educ,
    endorsement, endorsement2, experience, female, gender, healthcare,
    hhinc, job, race, reparations, X, priority
  ))



# Pool all conjoint tables, harmonise attribute labels across studies, turn
# the attributes into factors, and label each row's sample from its id suffix.
conjoints <- rbind(conjoint12, conjoint3, conjoint4, conjoint5, conjoint6) %>%
  mutate(
    experience = case_when(
      experience == "Big-city Mayor" ~ "Mayor of a large city",
      experience == "Small-city Mayor" ~ "Mayor of a small city",
      TRUE ~ experience
    ),
    healthcare = case_when(
      healthcare == "Americans who choose it over private health plans" ~ "Those who choose",
      healthcare == "Only Americans who are older, poor, or disabled" ~ "Elderly, poor, and disabled",
      TRUE ~ healthcare
    ),
    climate = case_when(
      climate == "Impose a tax on using fossil fuels, reducing economic growth by 3%" ~ "Tax fossil fuels",
      climate == "Promote the use of renewable energy but allow continued use of fossil fuels" ~ "Promote alternatives",
      climate == "Ban the use of fossil fuels after 2040, reducing economic growth by 5%" ~ "Ban fossil fuels",
      TRUE ~ climate
    ),
    # no default branch: any other value (including NA) stays/becomes NA
    priority = case_when(
      priority == "gun control" ~ "Gun control",
      priority == "regulate co2 emissions" ~ "Regulate Co2 emissions",
      priority == "tax the wealthy" ~ "Tax the wealthy",
      priority == "path to citizenship" ~ "Path to citizenship",
      priority == "expand free trade deals" ~ "Expand free trade deals",
      priority == "expand social safety net" ~ "Expand social safety net",
      priority == "lgbt rights" ~ "LGBT rights"
    ),
    priority = factor(priority),
    gender = factor(gender),
    race = factor(race),
    candage = factor(candage),
    job = factor(job),
    experience = factor(experience),
    healthcare = factor(healthcare),
    climate = factor(climate),
    reparations = factor(reparations),
    candselfplacement = factor(candselfplacement),
    endorsement = factor(endorsement),
    endorsement2 = factor(endorsement2),
    # ids were built as "<id> <sample number>"; ids matching none of the
    # suffixes are labelled as the CA voter survey
    sample = case_when(
      grepl(" 1", X) ~ "Lucid 1",
      grepl(" 2", X) ~ "Lucid 2",
      grepl(" 3", X) ~ "Lucid 3",
      grepl(" 4", X) ~ "Lucid 4",
      grepl(" 5", X) ~ "Lucid 5",
      TRUE ~ "CA voter survey"
    )
  )

# Marginal means of vote choice for every conjoint attribute, with
# respondent-level ids given by X. mm() / mm_diffs() are presumably the cregg
# functions (cregg is loaded at the top of the file).
# The result is stored under the name `mm`, which is also the function's name;
# R still finds the function when `mm(...)` is called.
mod <- chosen_candidate ~ race + gender + candage + job + experience + endorsement + endorsement2 + healthcare + climate + reparations + candselfplacement + priority
mm <- mm(conjoints, mod, id = ~X) %>%
  mutate(feature = case_when(feature == "race" ~ "Race",
                             feature == "gender" ~ "Gender",
                             feature == "candage" ~ "Age",
                             feature == "job" ~ "Occupation",
                             feature == "experience" ~ "Political experience",
                             feature == "endorsement" ~ "Endorsement",
                             feature == "endorsement2" ~ "Endorsement (Lucid 5)",
                             feature == "healthcare" ~ "Publicly funded healthcare",
                             feature == "climate" ~ "Fossil fuels",
                             feature == "reparations" ~ "Reparations",
                             feature == "candselfplacement" ~ "Ideological self-placement",
                             feature == "priority" ~ "Priority if elected"),
         # `ordered` is an argument of factor(). It used to sit inside c(),
         # which added a spurious level "TRUE" and left the factor unordered.
         feature = factor(feature,
                          levels = c("Race",
                                     "Gender",
                                     "Age",
                                     "Occupation",
                                     "Political experience",
                                     "Endorsement",
                                     "Endorsement (Lucid 5)",
                                     "Publicly funded healthcare",
                                     "Fossil fuels",
                                     "Reparations",
                                     "Ideological self-placement",
                                     "Priority if elected"),
                          ordered = TRUE)) %>%
  group_by(feature) %>%
  arrange(estimate, .by_group = TRUE) %>%
  # `row` is the rank of each level within its feature, by estimate
  mutate(row = row_number(),
         statistic = "Vote choice marginal means")

# Same attributes without race, which is used as the `by` variable below.
mod2 <- chosen_candidate ~ candage + gender + job + experience + endorsement + endorsement2 + healthcare + climate + reparations + candselfplacement + priority

# Order race as White < Black; any other race value becomes NA.
conjoints$race <- factor(conjoints$race, levels = c("White", "Black"), ordered = TRUE)

# Differences in marginal means between the race levels.
mmdiffs <- mm_diffs(
  conjoints,
  mod2,
  by = ~ race,
  id = ~ X,
  alpha = 0.05
) %>%
  mutate(feature = case_when(feature == "race" ~ "Race",
                             feature == "gender" ~ "Gender",
                             feature == "candage" ~ "Age",
                             feature == "job" ~ "Occupation",
                             feature == "experience" ~ "Political experience",
                             feature == "endorsement" ~ "Endorsement",
                             feature == "endorsement2" ~ "Endorsement (Lucid 5)",
                             feature == "healthcare" ~ "Publicly funded healthcare",
                             feature == "climate" ~ "Fossil fuels",
                             feature == "reparations" ~ "Reparations",
                             feature == "candselfplacement" ~ "Ideological self-placement",
                             feature == "priority" ~ "Priority if elected"),
         # same levels and ordering as `mm$feature`, so the two tables can be
         # stacked without a factor mismatch
         feature = factor(feature,
                          levels = c("Race",
                                     "Gender",
                                     "Age",
                                     "Occupation",
                                     "Political experience",
                                     "Endorsement",
                                     "Endorsement (Lucid 5)",
                                     "Publicly funded healthcare",
                                     "Fossil fuels",
                                     "Reparations",
                                     "Ideological self-placement",
                                     "Priority if elected"),
                          ordered = TRUE)) %>%
  group_by(feature) %>%
  arrange(estimate, .by_group = TRUE) %>%
  mutate(statistic = "Difference in marginal means\nby race") %>%
  dplyr::select(-c(BY, race))

# Stack the marginal means and their by-race differences. `row` exists only in
# `mm`, so it is filled downwards within each attribute level to give the
# difference rows the same `row` value. The result stays grouped by `level`.
marginals <- rbind(mm, mmdiffs) %>%
  mutate(
    statistic = factor(
      statistic,
      levels = c(
        "Vote choice marginal means",
        "Difference in marginal means\nby race"
      ),
      ordered = TRUE
    )
  ) %>%
  group_by(level) %>%
  fill(row)

# Appendix table: each marginal mean (tested against 0.5) and each by-race
# difference (tested against 0), formatted as "estimate + stars (std. error)".
#
# The p-value uses the approximation p = exp(-0.717 * |z| - 0.416 * z^2).
# It must be evaluated at |z|: with a signed z, every estimate below the null
# value got a far too large p-value (even above 1) and so lost its stars.
mmtab <- as.data.frame(mm) %>%
  mutate(z = (estimate - 0.5) / std.error,
         pval = exp(-0.717 * abs(z) - 0.416 * z^2),
         pstars = case_when(pval < 0.001 ~ "***",
                            pval < 0.01 ~ "**",
                            pval < 0.05 ~ "*",
                            TRUE ~ ""),
         estimate = paste0(round(estimate, 3), pstars, " (", round(std.error, 3), ")")) %>%
  dplyr::select(level, estimate)
mmdiffstab <- as.data.frame(mmdiffs) %>%
  mutate(z = estimate / std.error,
         pval = exp(-0.717 * abs(z) - 0.416 * z^2),
         pstars = case_when(pval < 0.001 ~ "***",
                            pval < 0.01 ~ "**",
                            pval < 0.05 ~ "*",
                            TRUE ~ ""),
         estimate = paste0(round(estimate, 3), pstars, " (", round(std.error, 3), ")")) %>%
  dplyr::select(level, estimate) %>%
  rename(diffest = estimate)
# `level` is the only column the two tables share; naming it makes the join
# key explicit and silences the "Joining, by = ..." message.
mmtab <- full_join(mmtab, mmdiffstab, by = "level")

write.csv(mmtab, "data/clean/appx_conjoint_mms.csv")


# demographics table
# Sample 1 ("Lucid 1"): respondent covariates turned into display labels, for
# rows whose candidate race is "Black".
demographics1 <- candidates %>%
  rename(chosen_candidate = won) %>%
  mutate(
    # rescale() (presumably scales::rescale) stretches the observed age range
    # onto 18-83 before binning -- NOTE(review): assumes age was stored on a
    # compressed scale; confirm against the raw data.
    age = rescale(age, to = c(18, 83)),
    age = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 65 ~ "50-64",
      age > 64 ~ "65+",
      TRUE ~ NA_character_
    ),
    female = case_when(
      female == 1 ~ "Female",
      female == 0 ~ "Male",
      TRUE ~ NA_character_
    ),
    educ = case_when(
      educ == 0 ~ "Less than HS",
      educ == 0.25 ~ "High school",
      educ == 0.5 ~ "Some college",
      educ == 0.75 ~ "Bachelor's degree",
      educ == 1 ~ "Post-secondary degree",
      TRUE ~ NA_character_
    ),
    hhinc = case_when(
      hhinc == 0 ~ "$24,999 or less",
      hhinc == 0.25 ~ "$25k-$54,999",
      hhinc == 0.5 ~ "$55k-$79,999",
      hhinc == 0.75 ~ "$80k-$149,999",
      hhinc == 1 ~ "$150k or more",
      TRUE ~ NA_character_
    ),
    strength_dem = case_when(
      strength_dem == 0 ~ "Lean Democrat",
      strength_dem == 0.5 ~ "Democrat",
      strength_dem == 1 ~ "Strong Democrat",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(race == "Black") %>%
  dplyr::select(
    chosen_candidate, age, female, educ, hhinc, region, strength_dem
  ) %>%
  mutate(sample = "Lucid 1")

# Sample 2 ("Lucid 2"): same labelling as for sample 1, for respondents with
# pid 5, 6 or 7 and rows whose candidate race is "Black".
demographics2 <- candidates2 %>%
  filter(pid %in% c(5, 6, 7)) %>%
  # pid is rescaled onto 0-1 and becomes strength_dem; this gives 0 / 0.5 / 1
  # as long as both pid 5 and pid 7 occur in the data
  mutate(pid = rescale(pid, to = c(0, 1))) %>%
  rename(strength_dem = pid) %>%
  mutate(
    # observed age range stretched onto 18-91 before binning
    age = rescale(age, to = c(18, 91)),
    age = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 65 ~ "50-64",
      age > 64 ~ "65+",
      TRUE ~ NA_character_
    ),
    female = case_when(
      female == 1 ~ "Female",
      female == 0 ~ "Male",
      TRUE ~ NA_character_
    ),
    educ = case_when(
      educ == 0 ~ "Less than HS",
      educ == 0.25 ~ "High school",
      educ == 0.5 ~ "Some college",
      educ == 0.75 ~ "Bachelor's degree",
      educ == 1 ~ "Post-secondary degree",
      TRUE ~ NA_character_
    ),
    hhinc = case_when(
      hhinc == 0 ~ "$24,999 or less",
      hhinc == 0.25 ~ "$25k-$54,999",
      hhinc == 0.5 ~ "$55k-$79,999",
      hhinc == 0.75 ~ "$80k-$149,999",
      hhinc == 1 ~ "$150k or more",
      TRUE ~ NA_character_
    ),
    strength_dem = case_when(
      strength_dem == 0 ~ "Lean Democrat",
      strength_dem == 0.5 ~ "Democrat",
      strength_dem == 1 ~ "Strong Democrat",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(race == "Black") %>%
  dplyr::select(
    chosen_candidate, age, female, educ, hhinc, region, strength_dem
  ) %>%
  mutate(sample = "Lucid 2")

# Sample 3 ("Lucid 3"): same labelling; partisan strength comes from X1_pid
# and region codes 1-4 are turned into region names. Keeps rows whose
# candidate race is "Black" and whose X1_pid is 5, 6 or 7.
demographics3 <- candidates3 %>%
  mutate(
    # observed age range stretched onto 18-93 before binning
    age = rescale(age, to = c(18, 93)),
    age = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 65 ~ "50-64",
      age > 64 ~ "65+",
      TRUE ~ NA_character_
    ),
    female = case_when(
      female == 1 ~ "Female",
      female == 0 ~ "Male",
      TRUE ~ NA_character_
    ),
    educ = case_when(
      educ == 0 ~ "Less than HS",
      educ == 0.25 ~ "High school",
      educ == 0.5 ~ "Some college",
      educ == 0.75 ~ "Bachelor's degree",
      educ == 1 ~ "Post-secondary degree",
      TRUE ~ NA_character_
    ),
    hhinc = case_when(
      hhinc == 0 ~ "$24,999 or less",
      hhinc == 0.25 ~ "$25k-$54,999",
      hhinc == 0.5 ~ "$55k-$79,999",
      hhinc == 0.75 ~ "$80k-$149,999",
      hhinc == 1 ~ "$150k or more",
      TRUE ~ NA_character_
    ),
    strength_dem = case_when(
      X1_pid == 5 ~ "Lean Democrat",
      X1_pid == 6 ~ "Democrat",
      X1_pid == 7 ~ "Strong Democrat",
      TRUE ~ NA_character_
    ),
    # any other region code becomes NA (no default branch)
    region = case_when(
      region == 1 ~ "Northeast",
      region == 2 ~ "Midwest",
      region == 3 ~ "South",
      region == 4 ~ "West"
    )
  ) %>%
  filter(race == "Black" & !is.na(strength_dem)) %>%
  dplyr::select(
    chosen_candidate, age, female, educ, hhinc, region, strength_dem
  ) %>%
  mutate(sample = "Lucid 3")

# Sample 4 ("Lucid 4"): "Control" form rows whose candidate race is "Black",
# labelled in the same way as the earlier samples.
demographics4 <- read.csv("data/raw/study4.csv") %>%
  filter(form == "Control" & race == "Black") %>%
  # pid 5 / 6 / 7 -> 0 / 0.5 / 1; anything else is NA
  mutate(
    strength_dem = case_when(
      pid == 5 ~ 0,
      pid == 6 ~ 0.5,
      pid == 7 ~ 1,
      TRUE ~ NA_real_
    )
  ) %>%
  mutate(
    # NOTE(review): here the age range is stretched onto 18-83 AFTER the
    # filter, so it is the filtered rows' min/max that map to 18 and 83.
    age = rescale(age, to = c(18, 83)),
    age = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 65 ~ "50-64",
      age > 64 ~ "65+",
      TRUE ~ NA_character_
    ),
    female = case_when(
      female == 1 ~ "Female",
      female == 0 ~ "Male",
      TRUE ~ NA_character_
    ),
    educ = case_when(
      educ == 0 ~ "Less than HS",
      educ == 0.25 ~ "High school",
      educ == 0.5 ~ "Some college",
      educ == 0.75 ~ "Bachelor's degree",
      educ == 1 ~ "Post-secondary degree",
      TRUE ~ NA_character_
    ),
    hhinc = case_when(
      hhinc == 0 ~ "$24,999 or less",
      hhinc == 0.25 ~ "$25k-$54,999",
      hhinc == 0.5 ~ "$55k-$79,999",
      hhinc == 0.75 ~ "$80k-$149,999",
      hhinc == 1 ~ "$150k or more",
      TRUE ~ NA_character_
    ),
    strength_dem = case_when(
      strength_dem == 0 ~ "Lean Democrat",
      strength_dem == 0.5 ~ "Democrat",
      strength_dem == 1 ~ "Strong Democrat",
      TRUE ~ NA_character_
    )
  ) %>%
  dplyr::select(
    chosen_candidate, age, female, educ, hhinc, strength_dem, region
  ) %>%
  mutate(sample = "Lucid 4")

# Sample 5 ("Lucid 5"): white respondents with nblack == 0.5, rows whose
# candidate race is "black". Raw text categories are mapped to the display
# labels used for the other samples.
demographics5 <- read.csv("data/raw/study5.csv") %>%
  filter(candrace == "black" & resprace == "white" & nblack == 0.5) %>%
  mutate(age = case_when(respage < 30 ~ "18-29",
                         respage < 40 ~ "30-39",
                         respage < 50 ~ "40-49",
                         respage < 65 ~ "50-64",
                         respage > 64 ~ "65+",
                         TRUE ~ NA_character_),
         # any respgender other than "woman" is labelled "Male"
         female = case_when(respgender == "woman" ~ "Female", TRUE ~ "Male"),
         # "aa" is grouped with "Some college": the conjoint recode of this
         # same file puts "aa" on the same 0.5 tier as "some college" and
         # "vocational training". It previously fell through to NA here.
         educ = case_when(educ == "less than hs" ~ "Less than HS",
                          educ == "high school degree" ~ "High school",
                          educ %in% c("some college, no degree", "vocational training", "aa") ~ "Some college",
                          educ == "ba" ~ "Bachelor's degree",
                          educ %in% c("doctorate", "ma or professional degree") ~ "Post-secondary degree",
                          TRUE ~ NA_character_),
         # NOTE(review): no pattern contains "25k-" on its own; confirm that
         # the raw income brackets are all covered by these patterns.
         hhinc = case_when(grepl("less|15k-|20k-", hhinc) ~ "$24,999 or less",
                           grepl("30k-|35k-|40k-|45k-|50k-", hhinc) ~ "$25k-$54,999",
                           grepl("55k-|60k-|65k-|70k-|75k-|80k-", hhinc) ~ "$55k-$79,999",
                           grepl("85k-|90k-|95k-|100k-|125k-", hhinc) ~ "$80k-$149,999",
                           grepl("150k-|175k-|200k-|250k-", hhinc) ~ "$150k or more",
                           TRUE ~ NA_character_),
         strength_dem = NA) %>%
  select(c(chosen_candidate, age, female, educ, hhinc, strength_dem, region)) %>%
  mutate(sample = "Lucid 5")

# California voter study: white Democratic respondents, rows whose candidate
# race is "Black". Income uses this survey's own brackets, marked
# "(CA sample)" where they differ from the other samples' brackets.
demographics6 <- read.csv("data/raw/ca_omnibus.csv") %>%
  filter(race == "white" & pid == "Democrat" & candrace == "Black") %>%
  mutate(
    age = case_when(
      age < 30 ~ "18-29",
      age < 40 ~ "30-39",
      age < 50 ~ "40-49",
      age < 65 ~ "50-64",
      age > 64 ~ "65+",
      TRUE ~ NA_character_
    ),
    # NOTE(review): copied as-is from `gender`; presumably already holds
    # "Male"/"Female" labels -- confirm against the raw file.
    female = gender,
    educ = case_when(
      r_education == 1 ~ "Less than HS",
      r_education == 2 ~ "High school",
      r_education %in% c(3, 4) ~ "Some college",
      r_education == 5 ~ "Bachelor's degree",
      r_education == 6 ~ "Post-secondary degree",
      TRUE ~ NA_character_
    ),
    hhinc = case_when(
      r_income == 1 ~ "$24,999 or less",
      r_income == 2 ~ "$25k-$49,999 (CA sample)",
      r_income == 3 ~ "$50k-$74,999 (CA sample)",
      r_income == 4 ~ "$75k-$99,999 (CA sample)",
      r_income == 5 ~ "$100k-$149,999 (CA sample)",
      r_income == 6 ~ "$150k or more",
      TRUE ~ NA_character_
    ),
    region = "West",
    strength_dem = NA
  ) %>%
  select(chosen_candidate, age, female, educ, hhinc, region, strength_dem) %>%
  # the sample label is added after the selection so that it is kept
  mutate(sample = "California voter study")

# Pool the six demographic tables (partisan strength is not used further) and
# fix the display order of the categorical covariates.
dems <- rbind(
  demographics1, demographics2, demographics3,
  demographics4, demographics5, demographics6
) %>%
  dplyr::select(-strength_dem)

dems[["female"]] <- factor(
  dems[["female"]],
  levels = c("Male", "Female"),
  ordered = TRUE
)
dems[["educ"]] <- factor(
  dems[["educ"]],
  levels = c(
    "Less than HS",
    "High school",
    "Some college",
    "Bachelor's degree",
    "Post-secondary degree"
  ),
  ordered = TRUE
)
# the CA-only income brackets are listed after the shared brackets
dems[["hhinc"]] <- factor(
  dems[["hhinc"]],
  levels = c(
    "$24,999 or less",
    "$25k-$54,999",
    "$55k-$79,999",
    "$80k-$149,999",
    "$150k or more",
    "$25k-$49,999 (CA sample)",
    "$50k-$74,999 (CA sample)",
    "$75k-$99,999 (CA sample)",
    "$100k-$149,999 (CA sample)"
  ),
  ordered = TRUE
)
dems[["region"]] <- factor(
  dems[["region"]],
  levels = c("Northeast", "Midwest", "South", "West"),
  ordered = TRUE
)

# Mean of chosen_candidate within each age group (no-intercept regression on
# the group dummies), tested against 0.5 and formatted as
# "estimate + stars (std. error)".
#
# The p-value approximation exp(-0.717 * |z| - 0.416 * z^2) must be evaluated
# at |z|; with a signed z, group means below 0.5 got wrong p-values and stars.
mod_age <- tidy(lm(chosen_candidate ~ factor(age, ordered = FALSE) + 0, dems)) %>%
  # strip the term prefix so that only the group label remains
  mutate(term = gsub("factor(age, ordered = FALSE)", "", term, fixed = TRUE),
         z = (estimate - 0.5) / std.error,
         pval = exp(-0.717 * abs(z) - 0.416 * z^2),
         pstars = case_when(pval < 0.001 ~ "***",
                            pval < 0.01 ~ "**",
                            pval < 0.05 ~ "*",
                            TRUE ~ ""),
         age.estimate = paste0(round(estimate, 3), pstars, " (", round(std.error, 3), ")")) %>%
  dplyr::select(c(term, age.estimate))

# One-row table holding the number of observations used by the model.
n_age <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(age, ordered = FALSE) + 0, dems))))
colnames(n_age) <- c("term", "age.estimate")

# Mean of chosen_candidate by respondent gender (no-intercept regression on
# the group dummies), tested against 0.5 and formatted as
# "estimate + stars (std. error)".
#
# The p-value approximation exp(-0.717 * |z| - 0.416 * z^2) must be evaluated
# at |z|; with a signed z, group means below 0.5 got wrong p-values and stars.
mod_fem <- tidy(lm(chosen_candidate ~ factor(female, ordered = FALSE) + 0, dems)) %>%
  # strip the term prefix so that only the group label remains
  mutate(term = gsub("factor(female, ordered = FALSE)", "", term, fixed = TRUE),
         z = (estimate - 0.5) / std.error,
         pval = exp(-0.717 * abs(z) - 0.416 * z^2),
         pstars = case_when(pval < 0.001 ~ "***",
                            pval < 0.01 ~ "**",
                            pval < 0.05 ~ "*",
                            TRUE ~ ""),
         fem.estimate = paste0(round(estimate, 3), pstars, " (", round(std.error, 3), ")")) %>%
  dplyr::select(c(term, fem.estimate))
# One-row table holding the number of observations used by the model.
n_fem <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(female, ordered = FALSE) + 0, dems))))
colnames(n_fem) <- c("term", "fem.estimate")


mod_hhinc <- tidy(lm(chosen_candidate ~ factor(hhinc, ordered=F) + 0, dems)) %>%
  mutate(term = gsub("factor\\(hhinc, ordered = F\\)", "", term),
         z = (estimate - 0.5)/std.error,
         pval = exp(-0.717*z - 0.416*z^2),
         pstars = case_when(pval < 0.001~"***",
                            pval < 0.01~"**",
                            pval < 0.05~"*",
                            T~""),
         hhinc.estimate = paste(round(estimate, 3), pstars, " (", round(std.error, 3), ")", sep = "")) %>%
  dplyr::select(c(term, hhinc.estimate))
n_hhinc <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(hhinc, ordered=F) + 0, dems))))
colnames(n_hhinc) <- c("term", "hhinc.estimate")

mod_educ <- tidy(lm(chosen_candidate ~ factor(educ, ordered=F) + 0, dems)) %>%
  mutate(term = gsub("factor\\(educ, ordered = F\\)", "", term),
         z = (estimate - 0.5)/std.error,
         pval = exp(-0.717*z - 0.416*z^2),
         pstars = case_when(pval < 0.001~"***",
                            pval < 0.01~"**",
                            pval < 0.05~"*",
                            T~""),
         educ.estimate = paste(round(estimate, 3), pstars, " (", round(std.error, 3), ")", sep = "")) %>%
  dplyr::select(c(term, educ.estimate))
n_educ <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(educ, ordered=F) + 0, dems))))
colnames(n_educ) <- c("term", "educ.estimate")

mod_region <- tidy(lm(chosen_candidate ~ factor(region, ordered=F) + 0, dems)) %>%
  mutate(term = gsub("factor\\(region, ordered = F\\)", "", term),
         z = (estimate - 0.5)/std.error,
         pval = exp(-0.717*z - 0.416*z^2),
         pstars = case_when(pval < 0.001~"***",
                            pval < 0.01~"**",
                            pval < 0.05~"*",
                            T~""),
         region.estimate = paste(round(estimate, 3), pstars, " (", round(std.error, 3), ")", sep = "")) %>%
  dplyr::select(c(term, region.estimate))
n_region <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(region, ordered=F) + 0, dems))))
colnames(n_region) <- c("term", "region.estimate")

mod_alldems <- tidy(lm(chosen_candidate ~ factor(age, ordered=F) + factor(female, ordered=F) + factor(hhinc, ordered=F) + factor(educ, ordered=F) + factor(region, ordered = F), dems)) %>%
  mutate(term = gsub("factor\\(.*, ordered = F\\)", "", term),
         z = (estimate)/std.error,
         pval = exp(-0.717*z - 0.416*z^2),
         pstars = case_when(pval < 0.001~"***",
                            pval < 0.01~"**",
                            pval < 0.05~"*",
                            T~""),
         alldems.estimate = paste(round(estimate, 3), pstars, " (", round(std.error, 3), ")", sep = "")) %>%
  dplyr::select(c(term, alldems.estimate))
n_alldems <- as.data.frame(cbind("N", nobs(lm(chosen_candidate ~ factor(age, ordered=F) + factor(female, ordered=F) + factor(hhinc, ordered=F) + factor(educ, ordered=F) + factor(region, ordered = F), dems))))
colnames(n_alldems) <- c("term", "alldems.estimate")

# Assemble the appendix demographics table: one row per coefficient term,
# one column per model, with the number of observations appended as a final
# "N" row.

# Terms of the age, gender, income and education models, in that order.
# Padding every model with these terms fixes the row order of the table;
# region and joint-model terms are appended by the later joins.
all_cols <- data.frame(term = c(mod_age$term, mod_fem$term,
                                mod_hhinc$term, mod_educ$term))

# All joins are keyed explicitly on `term`, the only column the inputs share.
mod_age <- full_join(mod_age, all_cols, by = "term")
mod_fem <- full_join(mod_fem, all_cols, by = "term")
mod_hhinc <- full_join(mod_hhinc, all_cols, by = "term")
mod_educ <- full_join(mod_educ, all_cols, by = "term")
mod_region <- full_join(mod_region, all_cols, by = "term")
mod_alldems <- full_join(mod_alldems, all_cols, by = "term")

join_on_term <- function(x, y) full_join(x, y, by = "term")

mods <- Reduce(join_on_term,
               list(mod_age, mod_fem, mod_hhinc,
                    mod_educ, mod_region, mod_alldems))

ns <- Reduce(join_on_term,
             list(n_age, n_fem, n_hhinc, n_educ, n_region, n_alldems))

mods <- rbind(mods, ns)

write.csv(dems, "data/clean/original_study_demographics.csv")
write.csv(mods, "data/clean/appx_demtab.csv")

# Bucket ages into the brackets used in the weighting cells. Ages at or above
# `upper` (and missing ages) get NA.
to_age_group <- function(age, upper = 100) {
  case_when(age < 30 ~ "18-29",
            age < 40 ~ "30-39",
            age < 50 ~ "40-49",
            age < 60 ~ "50-59",
            age < upper ~ "60p")
}

# Attach trimmed cell weights to `df`, which must contain `region`,
# `age_group` and `gender`.
#
# The cell label (`weightvar`) and the constant `weightgroup` / `group` labels
# are joined against `targets` (natural join on the shared columns), which is
# expected to supply `grouptarget`. The label strings are presumably join keys
# and must match `targets` exactly - confirm before changing them.
#
# weight = grouptarget * (rows in group) / (rows in cell), trimmed to
# [0.1, 5] (https://zacharylhertz.github.io/posts/2022/05/weighting-surveys);
# cells without a target get weight 1. Weights are then rescaled to mean 1.
add_cell_weights <- function(df, targets) {
  labelled <- df %>%
    mutate(weightgroup = "region x ag x gender",
           weightvar = paste(region, age_group, gender, sep = " x "),
           group = "2022 x White x Democrat") %>%
    left_join(targets)

  with_group_size <- labelled %>%
    group_by(group, weightgroup) %>%
    mutate(n = n()) %>%
    ungroup()

  trimmed <- with_group_size %>%
    group_by(weightgroup, group, weightvar) %>%
    mutate(weight = grouptarget * n / n(),
           subgroup_n = n(),
           weight = case_when(weight > 5 ~ 5,
                              weight < 0.1 ~ 0.1,
                              is.na(weight) ~ 1,
                              TRUE ~ weight)) %>%
    ungroup()

  trimmed %>%
    group_by(group, weightgroup) %>%
    mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
    ungroup()
}

# Study 1: rows with race == "Black", weighted, keeping the attitude measures
# available in that study.
attitudestab <- read.csv("data/raw/study1.csv") %>%
  filter(race == "Black") %>%
  mutate(chosen_candidate = as.numeric(won),
         age = rescale(age, to = c(18, 83)),
         age_group = to_age_group(age),
         region = tolower(region),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  add_cell_weights(targets) %>%
  select(c(chosen_candidate, weight, disc_black, rr_index, selfmon_index,
           age, region, gender, educ, hhinc))

# Study 2: same preparation; blank regions become NA and the thermometer
# items replace the study-1 attitude indices.
attitudestab2 <- read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
  filter(race == "Black") %>%
  mutate(age = rescale(age, to = c(18, 97)),
         age_group = to_age_group(age),
         region = case_when(region != "" ~ tolower(region),
                            TRUE ~ NA_character_),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  add_cell_weights(targets) %>%
  select(c(chosen_candidate, weight, therm_trump, therm_biden, selfmon_index,
           age, region, gender, educ, hhinc))

# Stack both studies; columns present in only one study are filled with NA.
attitudestab3 <- plyr::rbind.fill(attitudestab, attitudestab2)

write.csv(attitudestab3, "data/clean/attitudestab.csv")

# Approximate two-sided p-value from a z statistic using
# p = exp(-0.717 * |z| - 0.416 * z^2).
# The approximation is only meaningful for z >= 0: with a negative z the
# linear term flips sign and the result is inflated (it exceeds 1 for small
# |z|), so the absolute value is taken first.
approx_pval <- function(z) {
  z <- abs(z)
  exp(-0.717 * z - 0.416 * z^2)
}

# Conventional significance stars for a vector of p-values.
sig_stars <- function(p) {
  case_when(p < 0.001 ~ "***",
            p < 0.01 ~ "**",
            p < 0.05 ~ "*",
            TRUE ~ "")
}

# Bucket ages into the brackets used in the weighting cells. Ages at or above
# `upper` (and missing ages) get NA.
to_age_group <- function(age, upper = 100) {
  case_when(age < 30 ~ "18-29",
            age < 40 ~ "30-39",
            age < 50 ~ "40-49",
            age < 60 ~ "50-59",
            age < upper ~ "60p")
}

# Attach trimmed cell weights to `df`, which must contain `region`,
# `age_group` and `gender`.
#
# The cell label (`weightvar`) and the constant `weightgroup` / `group` labels
# are joined against `targets` (natural join on the shared columns), which is
# expected to supply `grouptarget`. The label strings are presumably join keys
# and must match `targets` exactly - confirm before changing them.
#
# weight = grouptarget * (rows in group) / (rows in cell), trimmed to
# [0.1, 5] (https://zacharylhertz.github.io/posts/2022/05/weighting-surveys);
# cells without a target get weight 1. Weights are then rescaled to mean 1.
add_cell_weights <- function(df, targets) {
  df %>%
    mutate(weightgroup = "region x ag x gender",
           weightvar = paste(region, age_group, gender, sep = " x "),
           group = "2022 x White x Democrat") %>%
    left_join(targets) %>%
    group_by(group, weightgroup) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    group_by(weightgroup, group, weightvar) %>%
    mutate(weight = grouptarget * n / n(),
           subgroup_n = n(),
           weight = case_when(weight > 5 ~ 5,
                              weight < 0.1 ~ 0.1,
                              is.na(weight) ~ 1,
                              TRUE ~ weight)) %>%
    ungroup() %>%
    group_by(group, weightgroup) %>%
    mutate(weight = weight / mean(weight, na.rm = TRUE)) %>%
    ungroup()
}

# Build the ideological-distance table from study 2 and the CA omnibus.
#
# restrict_sample: if TRUE, drop study-2 respondents with ideo in {1, 2}
#   (more conservative than any candidate on offer) and omnibus respondents
#   with liberal <= 0.3 or missing; if FALSE, keep everyone.
#
# Returns a list with every intermediate object (`dat_ideo1`, `dat_ideo2`,
# `dat_ideo`, `mod1`, `mod2`, `mods`, `ymax`, `modsp`, `ns`, `modsp2`) and the
# final `table`.
#
# Only columns that survive the final select() are computed; derived columns
# that were never used downstream of it are omitted.
build_ideo_tables <- function(restrict_sample) {
  # --- study 2 ---------------------------------------------------------
  study2 <- read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
    mutate(age = rescale(age, to = c(18, 97)),
           age_group = to_age_group(age),
           region = case_when(region != "" ~ tolower(region),
                              TRUE ~ NA_character_),
           gender = case_when(female == 0 ~ 1, female == 1 ~ 2))
  if (restrict_sample) {
    study2 <- filter(study2, !ideo %in% c(1, 2))
  }

  dat_ideo1 <- study2 %>%
    mutate(abs_ideo_distance = abs(ideo - candselfplacement_numfull)) %>%
    group_by(X) %>%
    # own distance minus the summed distance of the other rows sharing X
    mutate(rel_ideo_distance = abs_ideo_distance -
             (sum(abs_ideo_distance, na.rm = FALSE) - abs_ideo_distance)) %>%
    ungroup() %>%
    mutate(abs_ideo_distance = rescale(abs_ideo_distance, to = c(0, 1)),
           # reversed range: larger values = relatively closer candidate
           rel_ideo_distance = rescale(rel_ideo_distance, to = c(1, -1))) %>%
    dplyr::select(chosen_candidate, rel_ideo_distance, abs_ideo_distance,
                  race, X, age_group, region, gender) %>%
    mutate(rel_ideo_distance = round(rel_ideo_distance, 2))

  # --- CA omnibus ------------------------------------------------------
  omnibus <- read.csv("data/raw/ca_omnibus.csv")
  if (restrict_sample) {
    omnibus <- filter(omnibus, liberal > 0.3)
  }

  dat_ideo2 <- omnibus %>%
    mutate(candideo_num = case_when(candideo == "Very liberal" ~ 1,
                                    candideo == "Liberal" ~ 5/6,
                                    candideo == "Somewhat liberal" ~ 2/3,
                                    candideo == "Moderate" ~ 0.5,
                                    candideo == "Somewhat conservative" ~ 1/3,
                                    TRUE ~ NA_real_),
           region = "west",
           age_group = to_age_group(age, upper = 150),
           gender = case_when(gender == "Male" ~ 1, gender == "Female" ~ 2),
           abs_ideo_distance = abs(liberal - candideo_num)) %>%
    group_by(ResponseId) %>%
    mutate(rel_ideo_distance = abs_ideo_distance -
             (sum(abs_ideo_distance, na.rm = FALSE) - abs_ideo_distance)) %>%
    ungroup() %>%
    mutate(abs_ideo_distance = rescale(abs_ideo_distance, to = c(0, 1)),
           rel_ideo_distance = rescale(rel_ideo_distance, to = c(1, -1)),
           race = candrace,
           X = ResponseId) %>%
    dplyr::select(chosen_candidate, rel_ideo_distance, abs_ideo_distance,
                  race, X, age_group, gender, region)

  # --- pooled, weighted data ---------------------------------------------
  dat_ideo <- rbind(dat_ideo1, dat_ideo2) %>%
    mutate(rel_ideo_distance = round(rel_ideo_distance, 2)) %>%
    add_cell_weights(targets)

  # Weighted mean of chosen_candidate at each relative distance, for one
  # value of `race`.
  fit_by_race <- function(race_label) {
    tidy(lm(chosen_candidate ~ factor(rel_ideo_distance) + 0,
            weights = weight,
            data = dat_ideo[dat_ideo$race == race_label, ]),
         conf.int = TRUE) %>%
      filter(grepl("ideo", term)) %>%
      mutate(model = race_label,
             term = as.numeric(gsub("factor\\(rel_ideo_distance\\)", "", term)))
  }
  mod1 <- fit_by_race("Black")
  mod2 <- fit_by_race("White")

  # Test each mean against 0.5.
  mods <- rbind(mod1, mod2) %>%
    arrange(desc(term)) %>%
    ungroup() %>%
    mutate(z = abs(estimate - 0.5) / std.error,
           diff_from_50 = approx_pval(z),
           diff_from_50_stars = sig_stars(diff_from_50))

  ymax <- mods %>%
    group_by(term) %>%
    summarize(y.position = max(conf.high) + 0.05)

  # Race gap at each relative distance: second coefficient of a weighted
  # regression on race, fitted once per distance value. Which race is the
  # reference level depends on the level order of `race`.
  modsp <- dat_ideo %>%
    filter(!is.na(rel_ideo_distance)) %>%
    group_by(rel_ideo_distance) %>%
    do(tidy(lm(chosen_candidate ~ factor(race, ordered = FALSE),
               weights = weight,
               data = .))[2, c("estimate", "std.error")]) %>%
    ungroup() %>%
    mutate(upper = estimate + 1.96 * std.error,
           lower = estimate - 1.96 * std.error,
           z = estimate / std.error,
           # approx_pval() takes |z|, so negative gaps are tested correctly
           p = approx_pval(z),
           group1 = "White",
           group2 = "Black",
           diff = round(estimate, 3),
           plab = case_when(p < 0.001 ~ " < 0.001",
                            TRUE ~ paste0(" = ", round(p, 3))),
           p.signif = sig_stars(p),
           term = rel_ideo_distance,
           xmin = term - 0.03,
           xmax = term + 0.03) %>%
    full_join(ymax, by = "term")

  # Weighted number of observations per distance value and race.
  ns <- dat_ideo %>%
    filter(!is.na(rel_ideo_distance)) %>%
    group_by(rel_ideo_distance, race) %>%
    summarize(n = round(sum(weight), 2)) %>%
    rename(term = rel_ideo_distance,
           model = race)

  modsp2 <- modsp %>%
    mutate(estimate = paste0(round(estimate, 3), p.signif,
                             " (", round(std.error, 3), ")")) %>%
    dplyr::select(term, estimate)

  ideo_table <- mods %>%
    full_join(ns, by = c("term", "model")) %>%
    mutate(estimate = paste0(round(estimate, 3), diff_from_50_stars,
                             " (", round(std.error, 3), ")")) %>%
    dplyr::select(term, model, estimate, n) %>%
    pivot_wider(names_from = model, values_from = c(estimate, n)) %>%
    dplyr::select(term, estimate_Black, n_Black, estimate_White, n_White) %>%
    full_join(modsp2, by = "term") %>%
    arrange(term)

  list(dat_ideo1 = dat_ideo1, dat_ideo2 = dat_ideo2, dat_ideo = dat_ideo,
       mod1 = mod1, mod2 = mod2, mods = mods, ymax = ymax,
       modsp = modsp, ns = ns, modsp2 = modsp2, table = ideo_table)
}

# Table 1: restricted sample. Table 2: full sample.
ideo_restricted <- build_ideo_tables(restrict_sample = TRUE)
ideo_full <- build_ideo_tables(restrict_sample = FALSE)

ideotab <- ideo_restricted$table
ideotab2 <- ideo_full$table

# Keep the intermediate objects of the full-sample run available under their
# usual names, in case code further down relies on them.
dat_ideo1 <- ideo_full$dat_ideo1
dat_ideo2 <- ideo_full$dat_ideo2
dat_ideo <- ideo_full$dat_ideo
mod1 <- ideo_full$mod1
mod2 <- ideo_full$mod2
mods <- ideo_full$mods
ymax <- ideo_full$ymax
modsp <- ideo_full$modsp
ns <- ideo_full$ns
modsp2 <- ideo_full$modsp2

# Approximate two-sided p-value from a z statistic using
# p = exp(-0.717 * |z| - 0.416 * z^2).
# The approximation is only meaningful for z >= 0: with a negative z the
# linear term flips sign and the result is inflated (it exceeds 1 for small
# |z|), so the absolute value is taken first.
approx_pval <- function(z) {
  z <- abs(z)
  exp(-0.717 * z - 0.416 * z^2)
}

# Conventional significance stars for a vector of p-values.
sig_stars <- function(p) {
  case_when(p < 0.001 ~ "***",
            p < 0.01 ~ "**",
            p < 0.05 ~ "*",
            TRUE ~ "")
}

# Study 2, perceived-distance version of the ideology table. Only the columns
# kept by the final select() are computed; the ideological-distance columns
# that were derived and then discarded are omitted.
dat_ideo_perc <- read_csvy("data/raw/study2.csv", stringsAsFactors = TRUE) %>%
  mutate(age = rescale(age, to = c(18, 97)),
         age_group = case_when(age < 30 ~ "18-29",
                               age < 40 ~ "30-39",
                               age < 50 ~ "40-49",
                               age < 60 ~ "50-59",
                               age < 100 ~ "60p"),
         region = case_when(region != "" ~ tolower(region),
                            TRUE ~ NA_character_),
         gender = case_when(female == 0 ~ 1, female == 1 ~ 2)) %>%
  # remove participants who are more conservative than any of the candidates on offer
  filter(!ideo %in% c(1, 2)) %>%
  # distance of canddistance from 0.5, rescaled to [0, 1] over all rows
  mutate(percdistance_folded = rescale(abs(canddistance - 0.5), to = c(0, 1))) %>%
  group_by(X) %>%
  # own folded distance minus the summed folded distance of the other rows
  # sharing X
  mutate(rel_percdistance = percdistance_folded -
           (sum(percdistance_folded) - percdistance_folded)) %>%
  ungroup() %>%
  mutate(rel_percdistance = round(rescale(rel_percdistance, to = c(1, 0)), 2)) %>%
  # Cell weights: the label strings are joined against `targets` (natural
  # join), which is expected to supply `grouptarget`.
  mutate(weightgroup = "region x ag x gender",
         weightvar = paste(region, age_group, gender, sep = " x "),
         group = "2022 x White x Democrat") %>%
  left_join(targets) %>%
  group_by(group, weightgroup) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(weightgroup, group, weightvar) %>%
  mutate(weight = grouptarget * n / n(),
         subgroup_n = n(),
         # trim to [0.1, 5]; cells without a target get weight 1
         weight = case_when(weight > 5 ~ 5,
                            weight < 0.1 ~ 0.1,
                            is.na(weight) ~ 1,
                            TRUE ~ weight)) %>%
  ungroup() %>%
  # NOTE(review): unlike the other weighting pipelines in this script, the
  # trimmed weights are not rescaled to mean 1 here. Model estimates are
  # unaffected, but the weighted n's reported below are on a different scale
  # - confirm whether that is intended.
  select(c(rel_percdistance, race, chosen_candidate, weight))

# Weighted mean of chosen_candidate at each relative perceived distance, for
# one value of `race`.
fit_perc_by_race <- function(race_label) {
  tidy(lm(chosen_candidate ~ factor(rel_percdistance) + 0,
          weights = weight,
          data = dat_ideo_perc[dat_ideo_perc$race == race_label, ]),
       conf.int = TRUE) %>%
    filter(grepl("perc", term)) %>%
    mutate(model = race_label,
           term = as.numeric(gsub("factor\\(rel_percdistance\\)", "", term)))
}
mod1 <- fit_perc_by_race("Black")
mod2 <- fit_perc_by_race("White")

# Test each mean against 0.5.
mods <- rbind(mod1, mod2) %>%
  arrange(desc(term)) %>%
  ungroup() %>%
  mutate(z = abs(estimate - 0.5) / std.error,
         diff_from_50 = approx_pval(z),
         diff_from_50_stars = sig_stars(diff_from_50))

# Race gap at each distance value: with levels c("White", "Black") the second
# coefficient is the Black - White difference. The model is fitted once per
# distance value, and the p-value uses |z| so negative gaps are tested
# correctly.
modsp <- dat_ideo_perc %>%
  mutate(race = factor(race, levels = c("White", "Black"), ordered = TRUE)) %>%
  filter(!is.na(rel_percdistance)) %>%
  group_by(rel_percdistance) %>%
  do(tidy(lm(chosen_candidate ~ factor(race, ordered = FALSE),
             weights = weight,
             data = .))[2, c("estimate", "std.error")]) %>%
  ungroup() %>%
  mutate(p.signif = sig_stars(approx_pval(estimate / std.error)),
         term = rel_percdistance,
         estimate = paste0(round(estimate, 3), p.signif,
                           " (", round(std.error, 3), ")")) %>%
  dplyr::select(term, estimate)

# Weighted number of observations per distance value and race.
ns <- dat_ideo_perc %>%
  filter(!is.na(rel_percdistance)) %>%
  group_by(rel_percdistance, race) %>%
  summarize(n = round(sum(weight), 2)) %>%
  rename(term = rel_percdistance,
         model = race)

ideotab_perc <- mods %>%
  full_join(ns, by = c("term", "model")) %>%
  mutate(estimate = paste0(round(estimate, 3), diff_from_50_stars,
                           " (", round(std.error, 3), ")")) %>%
  dplyr::select(term, model, estimate, n) %>%
  pivot_wider(names_from = model, values_from = c(estimate, n)) %>%
  dplyr::select(term, estimate_Black, n_Black, estimate_White, n_White) %>%
  full_join(modsp, by = "term") %>%
  arrange(term) %>%
  # put the distance values on a [-1, 1] range
  mutate(term = rescale(term, to = c(-1, 1)))

# Stack the three ideology tables and write them out.
ideotab3 <- rbind(ideotab, ideotab2, ideotab_perc)

write.csv(ideotab3, "data/clean/ideotab.csv")
